Spaces:

ahmedzein
/

tableocr

Running

App Files Files Community

tableocr / helper.py

ahmedzein

Upload 7 files

c6a18bd verified 8 months ago

raw

history blame contribute delete

7.78 kB

	import torch

	from tqdm.auto import tqdm

	import matplotlib.pyplot as plt
	import matplotlib.patches as patches
	from matplotlib.patches import Patch

	import numpy as np
	from PIL import Image

	# ANSI terminal escape sequences: GREEN switches the foreground to bright
	# green, RESET restores the default text attributes.
	GREEN = "\033[92m"
	RESET = "\033[0m"

	class MaxResize(object):
	def __init__(self, max_size=800):
	self.max_size = max_size

	def __call__(self, image):
	width, height = image.size
	current_max_size = max(width, height)
	scale = self.max_size / current_max_size
	resized_image = image.resize((int(round(scalewidth)), int(round(scaleheight))))

	return resized_image

	# for output bounding box post-processing
	def box_cxcywh_to_xyxy(x):
	    """Convert boxes from (center_x, center_y, width, height) to (x0, y0, x1, y1).

	    ``x`` is a tensor whose last dimension has size 4. Any number of leading
	    dimensions is accepted (a single box, a list of boxes, a batch of lists);
	    the result has the same shape as the input.
	    """
	    x_c, y_c, w, h = x.unbind(-1)
	    corners = [x_c - 0.5 * w, y_c - 0.5 * h, x_c + 0.5 * w, y_c + 0.5 * h]
	    # Stack on the same (last) axis that was unbound so the layout is
	    # preserved for inputs of any rank, not only 2-D ones.
	    return torch.stack(corners, dim=-1)


	def rescale_bboxes(out_bbox, size):
	    """Convert center-format boxes to corner format and scale them by the image size.

	    ``size`` is ``(width, height)``; x coordinates are multiplied by the width
	    and y coordinates by the height.
	    """
	    width, height = size
	    per_coord_scale = torch.tensor([width, height, width, height], dtype=torch.float32)
	    return box_cxcywh_to_xyxy(out_bbox) * per_coord_scale

	def outputs_to_objects(outputs, img_size, id2label):
	    """Turn raw detection outputs into a list of ``{'label', 'score', 'bbox'}`` dicts.

	    Only the first element of the batch is read. ``outputs`` must provide a
	    ``logits`` attribute and support ``outputs['pred_boxes']``. Predictions
	    whose label resolves to ``'no object'`` are dropped.
	    """
	    # Highest softmax probability and its class index for every prediction.
	    best = outputs.logits.softmax(-1).max(-1)
	    label_ids = list(best.indices.detach().cpu().numpy())[0]
	    confidences = list(best.values.detach().cpu().numpy())[0]

	    raw_boxes = outputs['pred_boxes'].detach().cpu()[0]
	    scaled_boxes = [box.tolist() for box in rescale_bboxes(raw_boxes, img_size)]

	    detections = []
	    for label_id, confidence, box in zip(label_ids, confidences, scaled_boxes):
	        name = id2label[int(label_id)]
	        if name == 'no object':
	            continue
	        detections.append({
	            'label': name,
	            'score': float(confidence),
	            'bbox': [float(coord) for coord in box],
	        })
	    return detections


	def fig2img(fig):
	    """Save a Matplotlib figure into an in-memory buffer and open it as a PIL Image."""
	    from io import BytesIO

	    buffer = BytesIO()
	    fig.savefig(buffer)
	    # Rewind so the image reader starts at the first byte that was written.
	    buffer.seek(0)
	    return Image.open(buffer)


	def visualize_detected_tables(img, det_tables, out_path=None):
	    """Draw detected table boxes over ``img`` and return the Matplotlib figure.

	    Entries of ``det_tables`` labelled ``'table'`` or ``'table rotated'`` are
	    drawn as a tinted fill, an outline and a hatch layer; any other label is
	    skipped. When ``out_path`` is given the figure is also saved there.
	    """
	    # One colour per drawable label, used for both fill and outline.
	    colour_by_label = {
	        'table': (1, 0, 0.45),
	        'table rotated': (0.95, 0.6, 0.1),
	    }
	    outline_width = 2
	    outline_alpha = 0.3
	    hatch_pattern = '//////'

	    plt.imshow(img, interpolation="lanczos")
	    fig = plt.gcf()
	    fig.set_size_inches(20, 20)
	    ax = plt.gca()

	    for detection in det_tables:
	        bbox = detection['bbox']
	        colour = colour_by_label.get(detection['label'])
	        if colour is None:
	            continue

	        box_width = bbox[2] - bbox[0]
	        box_height = bbox[3] - bbox[1]
	        # Three stacked rectangles: translucent fill, solid outline, hatching.
	        layers = [
	            dict(linewidth=outline_width, edgecolor='none', facecolor=colour, alpha=0.1),
	            dict(linewidth=outline_width, edgecolor=colour, facecolor='none',
	                 linestyle='-', alpha=outline_alpha),
	            dict(linewidth=0, edgecolor=colour, facecolor='none',
	                 linestyle='-', hatch=hatch_pattern, alpha=0.2),
	        ]
	        for layer in layers:
	            ax.add_patch(patches.Rectangle(bbox[:2], box_width, box_height, **layer))

	    plt.xticks([], [])
	    plt.yticks([], [])

	    legend_elements = [
	        Patch(facecolor=(1, 0, 0.45), edgecolor=(1, 0, 0.45),
	              label='Table', hatch='//////', alpha=0.3),
	        Patch(facecolor=(0.95, 0.6, 0.1), edgecolor=(0.95, 0.6, 0.1),
	              label='Table (rotated)', hatch='//////', alpha=0.3),
	    ]
	    plt.legend(handles=legend_elements, bbox_to_anchor=(0.5, -0.02), loc='upper center',
	               borderaxespad=0, fontsize=10, ncol=2)
	    plt.gcf().set_size_inches(10, 10)
	    plt.axis('off')

	    if out_path is not None:
	        plt.savefig(out_path, bbox_inches='tight', dpi=150)

	    return fig

	def _iob(bbox1, bbox2):
	    """Return the intersection area of the two boxes divided by the area of ``bbox1``.

	    Boxes are ``[x0, y0, x1, y1]``. Returns 0 when ``bbox1`` has no area or the
	    boxes do not overlap.
	    """
	    width = bbox1[2] - bbox1[0]
	    height = bbox1[3] - bbox1[1]
	    if width <= 0 or height <= 0:
	        return 0
	    overlap_w = min(bbox1[2], bbox2[2]) - max(bbox1[0], bbox2[0])
	    overlap_h = min(bbox1[3], bbox2[3]) - max(bbox1[1], bbox2[1])
	    if overlap_w <= 0 or overlap_h <= 0:
	        return 0
	    return (overlap_w * overlap_h) / (width * height)


	def objects_to_crops(img, tokens, objects, class_thresholds, padding=10):
	    """
	    Process the bounding boxes produced by the table detection model into
	    cropped table images and cropped tokens.

	    Args:
	        img: image exposing ``crop(box)``; the crop must expose
	            ``rotate(angle, expand=...)`` and ``size`` (as PIL images do).
	        tokens: list of dicts with a ``'bbox'`` entry ``[x0, y0, x1, y1]`` in
	            image coordinates. The input dicts are not modified.
	        objects: detections with ``'label'``, ``'score'`` and ``'bbox'``.
	        class_thresholds: minimum score per label; detections scoring below
	            their label's threshold are skipped.
	        padding: number of pixels added on every side of the detected box.

	    Returns:
	        A list of ``{'image': cropped_image, 'tokens': tokens_in_crop}`` dicts,
	        where the token boxes are expressed in crop coordinates.
	    """
	    table_crops = []
	    for obj in objects:
	        if obj['score'] < class_thresholds[obj['label']]:
	            continue

	        bbox = obj['bbox']
	        bbox = [bbox[0] - padding, bbox[1] - padding, bbox[2] + padding, bbox[3] + padding]

	        cropped_img = img.crop(bbox)

	        # Keep tokens at least half covered by the padded box. Each token is
	        # copied before its bbox is rewritten so the caller's dicts stay
	        # intact and a token shared by two crops is not shifted twice.
	        table_tokens = []
	        for token in tokens:
	            token_bbox = token['bbox']
	            if _iob(token_bbox, bbox) >= 0.5:
	                local_token = dict(token)
	                local_token['bbox'] = [token_bbox[0] - bbox[0],
	                                       token_bbox[1] - bbox[1],
	                                       token_bbox[2] - bbox[0],
	                                       token_bbox[3] - bbox[1]]
	                table_tokens.append(local_token)

	        # If table is predicted to be rotated, rotate cropped image and tokens/words:
	        if obj['label'] == 'table rotated':
	            cropped_img = cropped_img.rotate(270, expand=True)
	            rotated_width = cropped_img.size[0]
	            for token in table_tokens:
	                token_bbox = token['bbox']
	                token['bbox'] = [rotated_width - token_bbox[3] - 1,
	                                 token_bbox[0],
	                                 rotated_width - token_bbox[1] - 1,
	                                 token_bbox[2]]

	        table_crops.append({'image': cropped_img, 'tokens': table_tokens})

	    return table_crops


	def get_cell_coordinates_by_row(table_data):
	    """Build a row-by-row grid of cell boxes from detected rows and columns.

	    Entries labelled ``'table row'`` are ordered top to bottom and entries
	    labelled ``'table column'`` left to right. Each cell box takes its
	    horizontal extent from the column and its vertical extent from the row.

	    Returns a list with one dict per row:
	    ``{'row': row_bbox, 'cells': [{'column': col_bbox, 'cell': cell_bbox}, ...],
	    'cell_count': number_of_cells}``.
	    """
	    ordered_rows = sorted(
	        (item for item in table_data if item['label'] == 'table row'),
	        key=lambda item: item['bbox'][1],
	    )
	    ordered_columns = sorted(
	        (item for item in table_data if item['label'] == 'table column'),
	        key=lambda item: item['bbox'][0],
	    )

	    grid = []
	    # Rows and columns are already ordered, so the grid comes out sorted
	    # top-to-bottom and, within each row, left-to-right.
	    for row_item in ordered_rows:
	        row_box = row_item['bbox']
	        cells = [
	            {
	                'column': col_item['bbox'],
	                'cell': [col_item['bbox'][0], row_box[1], col_item['bbox'][2], row_box[3]],
	            }
	            for col_item in ordered_columns
	        ]
	        grid.append({'row': row_box, 'cells': cells, 'cell_count': len(cells)})

	    return grid

	def apply_ocr(cell_coordinates, cropped_table, reader):
	    """Run OCR on every cell of a table, row by row.

	    Args:
	        cell_coordinates: list of row dicts, each with a ``"cells"`` list whose
	            items carry a ``"cell"`` crop box.
	        cropped_table: image exposing ``crop(box)``.
	        reader: OCR engine exposing ``readtext(array)``; each result item is
	            indexed with ``[1]`` to obtain its text (presumably an EasyOCR
	            ``Reader`` -- verify against the caller).

	    Returns:
	        dict mapping row index to a list of cell strings. Every cell yields
	        exactly one entry (an empty string when nothing was recognised), so
	        values stay aligned with their columns. Rows are padded with empty
	        strings to a common length.
	    """
	    # let's OCR row by row
	    data = {}
	    max_num_columns = 0
	    for idx, row in enumerate(tqdm(cell_coordinates)):
	        row_text = []
	        for cell in row["cells"]:
	            # crop cell out of image
	            cell_image = np.array(cropped_table.crop(cell["cell"]))
	            # apply OCR
	            result = reader.readtext(cell_image)
	            # An empty result yields "" instead of being skipped; skipping
	            # would shift the remaining cells of this row one column left.
	            row_text.append(" ".join(x[1] for x in result))

	        max_num_columns = max(max_num_columns, len(row_text))
	        data[idx] = row_text

	    # pad rows which don't have max_num_columns elements
	    # to make sure all rows have the same number of columns
	    for row_text in data.values():
	        row_text.extend([""] * (max_num_columns - len(row_text)))

	    return data