Spaces:

wwydmanski
/

tesseract-ocr

Runtime error

tesseract-ocr / app.py

Witold Wydmański

feat: replace pytesseract with tesserocr

1506ae7 almost 3 years ago

2.21 kB

	import gradio as gr
	import tempfile
	import tesserocr
	import os
	import fitz # PyMuPDF, imported as fitz for backward compatibility reasons
	from PIL import Image
	import logging
	from multiprocessing.pool import Pool

	# Configure the root logger so INFO-level records and above are emitted.
	logging.basicConfig(level=logging.INFO)

	# One Tesseract engine per selectable language code, built once at import time.
	# "pol" loads its language data from the local ./tessdata directory; "eng" is
	# created with tesserocr's default arguments.
	APIs = dict(
	    pol=tesserocr.PyTessBaseAPI(lang="pol", path="./tessdata"),
	    eng=tesserocr.PyTessBaseAPI(),
	)

	def pdf_to_image(pdf_file, path, progress, max_pages):
	    """Render the leading pages of a PDF to PNG files using PyMuPDF.

	    Args:
	        pdf_file: Object whose ``name`` attribute is the path of the PDF.
	        path: Existing directory that receives the ``page-<n>.png`` files.
	        progress: Object exposing ``tqdm(iterable, desc=..., total=...)``,
	            used to report conversion progress.
	        max_pages: Maximum number of pages to render. ``0``, ``None`` or a
	            negative value renders every page; a fractional value is
	            rounded up.

	    Returns:
	        List of the PNG paths that were written, in page order.
	    """
	    import math  # local import: only needed to round the page limit

	    fnames = []
	    # The context manager closes the document even when rendering fails.
	    with fitz.open(pdf_file.name) as doc:
	        page_count = len(doc)
	        limited = max_pages is not None and max_pages > 0
	        # Never announce more steps than the document has pages, and hand
	        # the progress bar an int rather than whatever numeric type came in.
	        total = min(page_count, math.ceil(max_pages)) if limited else page_count
	        pages = progress.tqdm(doc, desc="Converting PDF to image", total=total)
	        for idx, page in enumerate(pages, start=1):
	            pix = page.get_pixmap()
	            output = f"{path}/page-{idx}.png"
	            pix.save(output)
	            fnames.append(output)
	            # Stop right after the last wanted page so no extra page is loaded.
	            if idx >= total:
	                break
	    return fnames

	def tesseract_ocr(image, language, max_pages, progress=gr.Progress()):
	    """OCR a PDF page by page and return the path of a text file with the result.

	    Args:
	        image: The uploaded PDF; forwarded unchanged to ``pdf_to_image``.
	        language: Key into ``APIs`` selecting the Tesseract engine to use.
	        max_pages: Page limit forwarded to ``pdf_to_image``.
	        progress: Progress tracker used for both the conversion and the OCR
	            loop. The ``gr.Progress()`` default is kept in the signature on
	            purpose (presumably how Gradio injects its tracker; confirm
	            against the Gradio documentation).

	    Returns:
	        Path of a ``.txt`` temporary file holding the recognised text of all
	        processed pages, joined with newlines. The file is not deleted
	        automatically.

	    Raises:
	        KeyError: If ``language`` is not a key of ``APIs``.
	    """
	    # NOTE(review): the engine is shared module-level state; confirm it is
	    # safe to use when several requests are processed at the same time.
	    api = APIs[language]

	    text_res = []
	    # The rendered PNGs are only needed while OCR runs, so they live in a
	    # temporary directory that is removed when this block exits.
	    with tempfile.TemporaryDirectory() as path:
	        images = pdf_to_image(image, path, progress, max_pages)
	        for image_path in progress.tqdm(images, desc="Running OCR"):
	            with open(image_path, "rb") as f:
	                page_image = Image.open(f)
	                # Read the pixel data now, before the file handle is closed.
	                page_image.load()
	            api.SetImage(page_image)
	            text_res.append(api.GetUTF8Text())

	    # delete=False: the file must outlive this function so the caller can
	    # read it by path. UTF-8 is explicit so non-ASCII OCR output does not
	    # depend on the platform's locale encoding.
	    with tempfile.NamedTemporaryFile(
	        mode="w", delete=False, suffix=".txt", encoding="utf-8"
	    ) as file:
	        file.write("\n".join(text_res))
	    return file.name


	if __name__ == "__main__":
	    logging.info("Starting Tesseract OCR")

	    # Input widgets, listed in the positional order tesseract_ocr takes them.
	    pdf_input = gr.File(label="PDF file")
	    language_input = gr.Dropdown(["eng", "pol"], label="Language", value="eng")
	    page_limit_input = gr.Number(label="Number of pages", value=0)

	    # The function returns a file path, offered to the user as a download.
	    text_output = gr.File(label="Text file", type="file")

	    iface = gr.Interface(
	        fn=tesseract_ocr,
	        inputs=[pdf_input, language_input, page_limit_input],
	        outputs=text_output,
	        title="PDF to Text Converter",
	        description="Converts a PDF file to text using Tesseract OCR.",
	    )
	    # Enable request queuing before launching the server.
	    iface = iface.queue(concurrency_count=10)

	    # Listen on all interfaces on port 7860.
	    iface.launch(server_port=7860, server_name="0.0.0.0")