Spaces:
Runtime error
Runtime error
| import gradio as gr | |
| import tempfile | |
| import tesserocr | |
| import os | |
| import fitz # PyMuPDF, imported as fitz for backward compatibility reasons | |
| from PIL import Image | |
| import logging | |
| from multiprocessing.pool import Pool | |
# Emit INFO-level (and above) records from the root logger.
logging.basicConfig(level=logging.INFO)

# One Tesseract engine per selectable language, created once at import time.
# Polish loads its trained data from the local ./tessdata directory; English
# is constructed with the library defaults.
APIs = dict(
    pol=tesserocr.PyTessBaseAPI(lang="pol", path="./tessdata"),
    eng=tesserocr.PyTessBaseAPI(),
)
def pdf_to_image(pdf_file, path, progress, max_pages):
    """Render the leading pages of a PDF to PNG files using PyMuPDF.

    Args:
        pdf_file: Object whose ``name`` attribute is the path of the PDF.
        path: Existing directory that receives the ``page-<n>.png`` files.
        progress: Object exposing ``tqdm(iterable, desc=..., total=...)``,
            used to report rendering progress.
        max_pages: Maximum number of pages to render. ``0``, a negative
            value or ``None`` means "all pages"; a value larger than the
            document is clamped to the document's page count.

    Returns:
        Paths of the written PNG files in page order (numbering starts at 1).
    """
    # The number field may deliver a float or None; any positive value
    # renders at least one page, everything else means "no limit".
    if max_pages is not None and max_pages > 0:
        limit = max(1, int(max_pages))
    else:
        limit = 0

    fnames = []
    # The context manager closes the document even if rendering fails.
    with fitz.open(pdf_file.name) as doc:
        page_count = len(doc)
        if 0 < limit < page_count:
            page_count = limit
        # page_count is now the real number of pages rendered, so the
        # progress total can never overshoot the document length.
        for page_no in progress.tqdm(
            range(page_count),
            desc="Converting PDF to image",
            total=page_count,
        ):
            output = f"{path}/page-{page_no + 1}.png"
            doc[page_no].get_pixmap().save(output)
            fnames.append(output)
    return fnames
def tesseract_ocr(image, language, max_pages, progress=gr.Progress()):
    """OCR an uploaded PDF and return the path of a text file with the result.

    Args:
        image: Uploaded PDF as delivered by the file input; it is handed to
            ``pdf_to_image``, which opens the path in its ``name`` attribute.
        language: Key into ``APIs`` selecting the Tesseract engine.
        max_pages: Page limit forwarded to ``pdf_to_image``.
        progress: Gradio progress tracker used for both processing stages.

    Returns:
        Path of a UTF-8 encoded ``.txt`` temporary file holding the
        recognised text of every page, joined by newlines. The file is
        created with ``delete=False`` and is not removed here.

    Raises:
        gr.Error: If no file was supplied.
        KeyError: If ``language`` is not a key of ``APIs``.
    """
    if image is None:
        # Surface a readable message instead of an AttributeError on ``.name``.
        raise gr.Error("Please upload a PDF file.")

    api = APIs[language]
    # NOTE(review): the engines in ``APIs`` are module-level objects shared by
    # every call; confirm that concurrent requests cannot interleave
    # SetImage/GetUTF8Text on the same instance.

    # Page images only live for the duration of the OCR pass.
    with tempfile.TemporaryDirectory() as path:
        images = pdf_to_image(image, path, progress, max_pages)
        text_res = []
        for page_png in progress.tqdm(images, desc="Running OCR"):
            # Close each image before the temporary directory is removed.
            with Image.open(page_png) as page_img:
                api.SetImage(page_img)
                text_res.append(api.GetUTF8Text())

    # Write explicitly as UTF-8: the OCR output may contain non-ASCII
    # characters that the platform default encoding cannot represent.
    with tempfile.NamedTemporaryFile(
        mode="w", delete=False, suffix=".txt", encoding="utf-8"
    ) as file:
        file.write("\n".join(text_res))
    return file.name
if __name__ == "__main__":
    logging.info("Starting Tesseract OCR")

    # Input widgets, listed in the positional order of tesseract_ocr's
    # parameters: the PDF, the OCR language, and the page limit.
    pdf_input = gr.File(label="PDF file")
    language_input = gr.Dropdown(["eng", "pol"], label="Language", value="eng")
    page_limit_input = gr.Number(label="Number of pages", value=0)

    # The generated text file is offered back to the user as a download.
    text_output = gr.File(label="Text file", type="file")

    iface = gr.Interface(
        fn=tesseract_ocr,
        inputs=[pdf_input, language_input, page_limit_input],
        outputs=text_output,
        title="PDF to Text Converter",
        description="Converts a PDF file to text using Tesseract OCR.",
    )
    # Enable the request queue before starting the server.
    iface = iface.queue(concurrency_count=10)
    # Listen on all interfaces on port 7860.
    iface.launch(server_port=7860, server_name="0.0.0.0")