Spaces:

satyakimitra
/

text_auth_ai

Sleeping

App Files Files Community

text_auth_ai / processors /document_extractor.py

satyakimitra

Repo updated

8205d6b about 2 months ago

raw

history blame contribute delete

34.1 kB

	# DEPENDENCIES
	import io
	import os
	import mimetypes
	from typing import Any
	from typing import Dict
	from typing import List
	from pathlib import Path
	from typing import Tuple
	from loguru import logger
	from typing import Optional
	from dataclasses import dataclass


	# Document processing libraries
	try:
	import PyPDF2
	import pdfplumber
	PDF_AVAILABLE = True
	except ImportError:
	logger.warning("PDF libraries not available. Install: pip install PyPDF2 pdfplumber")
	PDF_AVAILABLE = False

	try:
	from docx import Document as DocxDocument
	DOCX_AVAILABLE = True
	except ImportError:
	logger.warning("python-docx not available. Install: pip install python-docx")
	DOCX_AVAILABLE = False

	try:
	import chardet
	CHARDET_AVAILABLE = True
	except ImportError:
	logger.warning("chardet not available. Install: pip install chardet")
	CHARDET_AVAILABLE = False

	try:
	from bs4 import BeautifulSoup
	BS4_AVAILABLE = True
	except ImportError:
	logger.warning("BeautifulSoup not available. Install: pip install beautifulsoup4")
	BS4_AVAILABLE = False


	@dataclass
	class ExtractedDocument:
	"""
	Container for extracted document content with metadata
	"""
	text : str
	file_path : Optional[str]
	file_type : str
	file_size_bytes : int
	page_count : int
	extraction_method : str
	metadata : Dict[str, Any]
	is_success : bool
	error_message : Optional[str]
	warnings : List[str]


	def to_dict(self) -> Dict[str, Any]:
	"""
	Convert to dictionary for JSON serialization
	"""
	return {"text_length" : len(self.text),
	"file_type" : self.file_type,
	"file_size_bytes" : self.file_size_bytes,
	"page_count" : self.page_count,
	"extraction_method" : self.extraction_method,
	"metadata" : self.metadata,
	"is_success" : self.is_success,
	"error_message" : self.error_message,
	"warnings" : self.warnings,
	}


	class DocumentExtractor:
	"""
	Extracts text from various document formats for AI detection processing

	Supported Formats:
	- Plain text (.txt, .md, .log)
	- PDF documents (.pdf)
	- Microsoft Word (.doc, .docx)
	- Rich Text Format (.rtf)
	- HTML files (.html, .htm)

	Features:
	- Robust error handling
	- Encoding detection
	- Metadata extraction
	- Page/section preservation
	- Memory-efficient processing
	"""

	# Supported file extensions
	SUPPORTED_EXTENSIONS = {'.txt', '.text', '.md', '.markdown', '.log', '.csv', '.pdf', '.docx', '.doc', '.rtf', '.html', '.htm'}

	# Text file extensions
	TEXT_EXTENSIONS = {'.txt', '.text', '.md', '.markdown', '.log', '.csv'}

	# Maximum file size (50 MB default)
	MAX_FILE_SIZE = 50 * 1024 * 1024


	def __init__(self, max_file_size: int = MAX_FILE_SIZE, prefer_pdfplumber: bool = True, extract_metadata: bool = True):
	"""
	Initialize document extractor

	Arguments:
	----------
	max_file_size : Maximum file size in bytes

	prefer_pdfplumber : Use pdfplumber over PyPDF2 (better quality)

	extract_metadata : Extract document metadata
	"""
	self.max_file_size = max_file_size
	self.prefer_pdfplumber = prefer_pdfplumber
	self.extract_metadata = extract_metadata

	logger.info(f"DocumentExtractor initialized (max_size={max_file_size/1024/1024:.1f}MB)")


	def extract(self, file_path: str) -> ExtractedDocument:
	"""
	Extract text from document

	Arguments:
	----------
	file_path { str } : Path to the document file

	Returns:
	--------
	{ ExtractedDocument } : ExtractedDocument object with extracted text and metadata
	"""
	try:
	file_path = Path(file_path)

	# Validate file
	validation_result = self._validate_file(file_path)

	if not validation_result[0]:
	return self._create_error_result(file_path = str(file_path),
	error = validation_result[1],
	)

	# Get file info
	file_size = file_path.stat().st_size
	file_ext = file_path.suffix.lower()

	# Route to appropriate extractor
	if (file_ext in self.TEXT_EXTENSIONS):
	result = self._extract_text_file(file_path)

	elif (file_ext == '.pdf'):
	result = self._extract_pdf(file_path)

	elif (file_ext in {'.docx', '.doc'}):
	result = self._extract_word(file_path)

	elif (file_ext == '.rtf'):
	result = self._extract_rtf(file_path)

	elif (file_ext in {'.html', '.htm'}):
	result = self._extract_html(file_path)

	else:
	return self._create_error_result(file_path = str(file_path),
	error = f"Unsupported file type: {file_ext}",
	)

	# Add common metadata
	result.file_path = str(file_path)
	result.file_size_bytes = file_size

	logger.info(f"Extracted {len(result.text)} chars from {file_path.name}")
	return result

	except Exception as e:
	logger.error(f"Error extracting document: {repr(e)}")
	return self._create_error_result(file_path = str(file_path) if file_path else None,
	error = repr(e),
	)


	def extract_from_bytes(self, file_bytes: bytes, filename: str, mime_type: Optional[str] = None) -> ExtractedDocument:
	"""
	Extract text from bytes (for file uploads)

	Arguments:
	----------
	file_bytes : File content as bytes

	filename : Original filename

	mime_type : MIME type (optional)

	Returns:
	--------
	ExtractedDocument object
	"""
	try:
	# Determine file type
	file_ext = Path(filename).suffix.lower()

	if file_ext not in self.SUPPORTED_EXTENSIONS:
	return self._create_error_result(file_path = filename,
	error = f"Unsupported file type: {file_ext}",
	)

	# Check size
	if (len(file_bytes) > self.max_file_size):
	return self._create_error_result(file_path = filename,
	error = f"File too large: {len(file_bytes)/1024/1024:.1f}MB"
	)

	# Route to appropriate extractor
	if (file_ext in self.TEXT_EXTENSIONS):
	result = self._extract_text_bytes(file_bytes, filename)

	elif (file_ext == '.pdf'):
	result = self._extract_pdf_bytes(file_bytes, filename)

	elif (file_ext in {'.docx', '.doc'}):
	result = self._extract_word_bytes(file_bytes, filename)

	elif (file_ext == '.rtf'):
	result = self._extract_rtf_bytes(file_bytes, filename)

	elif (file_ext in {'.html', '.htm'}):
	result = self._extract_html_bytes(file_bytes, filename)

	else:
	return self._create_error_result(file_path = filename,
	error = f"Unsupported file type: {file_ext}"
	)

	result.file_path = filename
	result.file_size_bytes = len(file_bytes)

	return result

	except Exception as e:
	logger.error(f"Error extracting from bytes: {e}")
	return self._create_error_result(file_path = filename,
	error = repr(e),
	)


	def _extract_text_file(self, file_path: Path) -> ExtractedDocument:
	"""
	Extract text from plain text files
	"""
	warnings = list()

	try:
	# Try to detect encoding
	encoding = 'utf-8'

	if CHARDET_AVAILABLE:
	with open(file_path, 'rb') as f:
	raw_data = f.read()
	detected = chardet.detect(raw_data)
	if detected['confidence'] > 0.7:
	encoding = detected['encoding']
	logger.debug(f"Detected encoding: {encoding} (confidence: {detected['confidence']})")

	# Read file with detected encoding
	try:
	with open(file_path, 'r', encoding=encoding) as f:
	text = f.read()

	except UnicodeDecodeError:
	# Fallback to latin-1 (never fails)
	warnings.append(f"Failed to decode with {encoding}, using latin-1")
	with open(file_path, 'r', encoding = 'latin-1') as f:
	text = f.read()

	return ExtractedDocument(text = text,
	file_path = str(file_path),
	file_type = file_path.suffix,
	file_size_bytes = file_path.stat().st_size,
	page_count = 1,
	extraction_method = 'plain_text',
	metadata = {'encoding': encoding},
	is_success = True,
	error_message = None,
	warnings = warnings,
	)

	except Exception as e:
	return self._create_error_result(file_path = str(file_path),
	error = repr(e),
	)


	def _extract_text_bytes(self, file_bytes: bytes, filename: str) -> ExtractedDocument:
	"""
	Extract text from bytes
	"""
	warnings = list()

	try:
	# Detect encoding
	encoding = 'utf-8'

	if CHARDET_AVAILABLE:
	detected = chardet.detect(file_bytes)
	if (detected['confidence'] > 0.7):
	encoding = detected['encoding']

	# Decode
	try:
	text = file_bytes.decode(encoding)

	except UnicodeDecodeError:
	warnings.append(f"Failed to decode with {encoding}, using latin-1")
	text = file_bytes.decode('latin-1')

	return ExtractedDocument(text = text,
	file_path = filename,
	file_type = Path(filename).suffix,
	file_size_bytes = len(file_bytes),
	page_count = 1,
	extraction_method = 'plain_text',
	metadata = {'encoding': encoding},
	is_success = True,
	error_message = None,
	warnings = warnings,
	)

	except Exception as e:
	return self._create_error_result(file_path = filename,
	error = repr(e),
	)


	def _extract_pdf(self, file_path: Path) -> ExtractedDocument:
	"""
	Extract text from PDF files
	"""
	if not PDF_AVAILABLE:
	return self._create_error_result(file_path = (file_path),
	error = "PDF libraries not installed",
	)

	warnings = list()
	text = ""
	page_count = 0
	metadata = dict()

	# Try pdfplumber first (better quality)
	if self.prefer_pdfplumber:
	try:
	with pdfplumber.open(file_path) as pdf:
	page_count = len(pdf.pages)
	metadata = pdf.metadata or {}

	for page in pdf.pages:
	page_text = page.extract_text()

	if page_text:
	text += page_text + "\n\n"

	if text.strip():
	return ExtractedDocument(text = text.strip(),
	file_path = str(file_path),
	file_type = '.pdf',
	file_size_bytes = file_path.stat().st_size,
	page_count = page_count,
	extraction_method = 'pdfplumber',
	metadata = metadata,
	is_success = True,
	error_message = None,
	warnings = warnings,
	)
	except Exception as e:
	warnings.append(f"pdfplumber failed: {repr(e)}, trying PyPDF2")

	# Fallback to PyPDF2
	try:
	with open(file_path, 'rb') as f:
	reader = PyPDF2.PdfReader(f)
	page_count = len(reader.pages)

	if self.extract_metadata:
	metadata = reader.metadata or {}

	for page in reader.pages:
	page_text = page.extract_text()

	if page_text:
	text += page_text + "\n\n"

	if not text.strip():
	warnings.append("PDF appears to be image-based or encrypted")

	return ExtractedDocument(text = text.strip(),
	file_path = str(file_path),
	file_type = '.pdf',
	file_size_bytes = file_path.stat().st_size,
	page_count = page_count,
	extraction_method = 'PyPDF2',
	metadata = metadata,
	is_success = bool(text.strip()),
	error_message = None if text.strip() else "No text extracted",
	warnings = warnings,
	)

	except Exception as e:
	return self._create_error_result(file_path = str(file_path),
	error = repr(e),
	)


	def _extract_pdf_bytes(self, file_bytes: bytes, filename: str) -> ExtractedDocument:
	"""
	Extract text from PDF bytes
	"""
	if not PDF_AVAILABLE:
	return self._create_error_result(file_path = filename,
	error = "PDF libraries not installed",
	)

	warnings = list()
	text = ""
	page_count = 0
	metadata = dict()

	try:
	# Try pdfplumber
	if self.prefer_pdfplumber:
	try:
	with pdfplumber.open(io.BytesIO(file_bytes)) as pdf:
	page_count = len(pdf.pages)
	metadata = pdf.metadata or {}

	for page in pdf.pages:
	page_text = page.extract_text()

	if page_text:
	text += page_text + "\n\n"

	if text.strip():
	return ExtractedDocument(text = text.strip(),
	file_path = filename,
	file_type = '.pdf',
	file_size_bytes = len(file_bytes),
	page_count = page_count,
	extraction_method = 'pdfplumber',
	metadata = metadata,
	is_success = True,
	error_message = None,
	warnings = warnings,
	)
	except Exception as e:
	warnings.append(f"pdfplumber failed: {repr(e)}, trying PyPDF2")

	# Fallback to PyPDF2
	reader = PyPDF2.PdfReader(io.BytesIO(file_bytes))
	page_count = len(reader.pages)

	for page in reader.pages:
	page_text = page.extract_text()

	if page_text:
	text += page_text + "\n\n"

	return ExtractedDocument(text = text.strip(),
	file_path = filename,
	file_type = '.pdf',
	file_size_bytes = len(file_bytes),
	page_count = page_count,
	extraction_method = 'PyPDF2',
	metadata = metadata,
	is_success = bool(text.strip()),
	error_message = None if text.strip() else "No text extracted",
	warnings = warnings,
	)

	except Exception as e:
	return self._create_error_result(file_path = filename,
	error = repr(e),
	)


	def _extract_word(self, file_path: Path) -> ExtractedDocument:
	"""
	Extract text from Word documents
	"""
	if not DOCX_AVAILABLE:
	return self._create_error_result(file_path = str(file_path),
	error = "python-docx not installed",
	)

	try:
	doc = DocxDocument(file_path)

	# Extract text from paragraphs
	paragraphs = [p.text for p in doc.paragraphs if p.text.strip()]
	text = "\n\n".join(paragraphs)

	# Extract metadata
	metadata = dict()

	if self.extract_metadata:
	core_props = doc.core_properties
	metadata = {'author' : core_props.author,
	'title' : core_props.title,
	'subject' : core_props.subject,
	'created' : str(core_props.created) if core_props.created else None,
	'modified' : str(core_props.modified) if core_props.modified else None,
	}

	return ExtractedDocument(text = text,
	file_path = str(file_path),
	file_type = file_path.suffix,
	file_size_bytes = file_path.stat().st_size,
	page_count = len(paragraphs), # Approximate
	extraction_method = 'python-docx',
	metadata = metadata,
	is_success = True,
	error_message = None,
	warnings = [],
	)

	except Exception as e:
	return self._create_error_result(file_path = str(file_path),
	error = repr(e),
	)


	def _extract_word_bytes(self, file_bytes: bytes, filename: str) -> ExtractedDocument:
	"""
	Extract text from Word document bytes
	"""
	if not DOCX_AVAILABLE:
	return self._create_error_result(file_path = filename,
	error = "python-docx not installed",
	)

	try:
	doc = DocxDocument(io.BytesIO(file_bytes))

	paragraphs = [p.text for p in doc.paragraphs if p.text.strip()]
	text = "\n\n".join(paragraphs)

	metadata = dict()

	if self.extract_metadata:
	core_props = doc.core_properties
	metadata = {'author' : core_props.author,
	'title' : core_props.title,
	}

	return ExtractedDocument(text = text,
	file_path = filename,
	file_type = Path(filename).suffix,
	file_size_bytes = len(file_bytes),
	page_count = len(paragraphs),
	extraction_method = 'python-docx',
	metadata = metadata,
	is_success = True,
	error_message = None,
	warnings = [],
	)

	except Exception as e:
	return self._create_error_result(file_path = filename,
	error = repr(e),
	)


	def _extract_rtf(self, file_path: Path) -> ExtractedDocument:
	"""
	Extract text from RTF files (basic implementation)
	"""
	warnings = ["RTF extraction is basic, formatting may be lost"]

	try:
	with open(file_path, 'r', encoding='latin-1') as f:
	content = f.read()

	# Very basic RTF stripping (remove control words)
	text = re.sub(r'\\[a-z]+\d*\s?', '', content)
	text = re.sub(r'[{}]', '', text)
	text = text.strip()

	return ExtractedDocument(text = text,
	file_path = str(file_path),
	file_type = '.rtf',
	file_size_bytes = file_path.stat().st_size,
	page_count = 1,
	extraction_method = 'basic_rtf',
	metadata = {},
	is_success = True,
	error_message = None,
	warnings = warnings,
	)

	except Exception as e:
	return self._create_error_result(file_path = str(file_path),
	error = repr(e),
	)


	def _extract_rtf_bytes(self, file_bytes: bytes, filename: str) -> ExtractedDocument:
	"""
	Extract text from RTF bytes
	"""
	warnings = ["RTF extraction is basic, formatting may be lost"]

	try:
	content = file_bytes.decode('latin-1')

	# Basic RTF stripping
	text = re.sub(r'\\[a-z]+\d*\s?', '', content)
	text = re.sub(r'[{}]', '', text)
	text = text.strip()

	return ExtractedDocument(text = text,
	file_path = filename,
	file_type = '.rtf',
	file_size_bytes = len(file_bytes),
	page_count = 1,
	extraction_method = 'basic_rtf',
	metadata = {},
	is_success = True,
	error_message = None,
	warnings = warnings,
	)

	except Exception as e:
	return self._create_error_result(file_path = filename,
	error = repr(e),
	)


	def _extract_html(self, file_path: Path) -> ExtractedDocument:
	"""
	Extract text from HTML files
	"""
	if not BS4_AVAILABLE:
	return self._create_error_result(file_path = str(file_path),
	error = "BeautifulSoup not installed",
	)

	try:
	with open(file_path, 'r', encoding = 'utf-8', errors = 'ignore') as f:
	content = f.read()

	soup = BeautifulSoup(content, 'html.parser')

	# Remove script and style elements
	for script in soup(["script", "style"]):
	script.decompose()

	# Get text
	text = soup.get_text(separator='\n')

	# Clean up whitespace
	lines = (line.strip() for line in text.splitlines())
	text = '\n'.join(line for line in lines if line)

	return ExtractedDocument(text = text,
	file_path = str(file_path),
	file_type = file_path.suffix,
	file_size_bytes = file_path.stat().st_size,
	page_count = 1,
	extraction_method = 'beautifulsoup',
	metadata = {},
	is_success = True,
	error_message = None,
	warnings = [],
	)

	except Exception as e:
	return self._create_error_result(file_path = str(file_path),
	error = repr(e),
	)


	def _extract_html_bytes(self, file_bytes: bytes, filename: str) -> ExtractedDocument:
	"""
	Extract text from HTML bytes
	"""
	if not BS4_AVAILABLE:
	return self._create_error_result(file_path = filename,
	error = "BeautifulSoup not installed",
	)

	try:
	content = file_bytes.decode('utf-8', errors = 'ignore')

	soup = BeautifulSoup(content, 'html.parser')

	for script in soup(["script", "style"]):
	script.decompose()

	text = soup.get_text(separator='\n')
	lines = (line.strip() for line in text.splitlines())
	text = '\n'.join(line for line in lines if line)

	return ExtractedDocument(text = text,
	file_path = filename,
	file_type = Path(filename).suffix,
	file_size_bytes = len(file_bytes),
	page_count = 1,
	extraction_method = 'beautifulsoup',
	metadata = {},
	is_success = True,
	error_message = None,
	warnings = [],
	)

	except Exception as e:
	return self._create_error_result(file_path = filename,
	error = repr(e),
	)


	def _validate_file(self, file_path: Path) -> Tuple[bool, Optional[str]]:
	"""
	Validate file before extraction
	"""
	# Check if file exists
	if not file_path.exists():
	return False, f"File not found: {file_path}"

	# Check if it's a file
	if not file_path.is_file():
	return False, f"Not a file: {file_path}"

	# Check file size
	file_size = file_path.stat().st_size
	if (file_size > self.max_file_size):
	return False, f"File too large: {file_size/1024/1024:.1f}MB (max: {self.max_file_size/1024/1024:.1f}MB)"

	# Check file extension
	if (file_path.suffix.lower() not in self.SUPPORTED_EXTENSIONS):
	return False, f"Unsupported file type: {file_path.suffix}"

	return True, None


	def _create_error_result(self, file_path: Optional[str], error: str) -> ExtractedDocument:
	"""
	Create error result
	"""
	return ExtractedDocument(text = "",
	file_path = file_path,
	file_type = Path(file_path).suffix if file_path else "unknown",
	file_size_bytes = 0,
	page_count = 0,
	extraction_method = "failed",
	metadata = {},
	is_success = False,
	error_message = error,
	warnings = [],
	)


	# Convenience Functions

	def extract_text(file_path: str, **kwargs) -> ExtractedDocument:
	"""
	Quick text extraction with default settings

	Arguments:
	----------
	file_path : Path to document
	**kwargs : Override settings

	Returns:
	--------
	ExtractedDocument object
	"""
	extractor = DocumentExtractor(**kwargs)
	return extractor.extract(file_path)


	def extract_from_upload(file_bytes: bytes, filename: str, **kwargs) -> ExtractedDocument:
	"""
	Extract text from uploaded file

	Arguments:
	----------
	file_bytes : File content as bytes
	filename : Original filename
	**kwargs : Override settings

	Returns:
	--------
	ExtractedDocument object
	"""
	extractor = DocumentExtractor(**kwargs)
	return extractor.extract_from_bytes(file_bytes, filename)


	# Export
	__all__ = ['DocumentExtractor',
	'ExtractedDocument',
	'extract_text',
	'extract_from_upload',
	]


	# Testing
	if __name__ == "__main__":
	import sys

	if len(sys.argv) > 1:
	# Test with provided file
	test_file = sys.argv[1]
	print(f"Testing extraction on: {test_file}")
	print("=" * 70)

	result = extract_text(test_file)

	print(f"Success: {result.is_success}")
	print(f"File type: {result.file_type}")
	print(f"Pages: {result.page_count}")
	print(f"Method: {result.extraction_method}")
	print(f"Text length: {len(result.text)} chars")

	if result.warnings:
	print(f"Warnings: {result.warnings}")

	if result.error_message:
	print(f"Error: {result.error_message}")

	if result.text:
	print(f"\nFirst 500 chars:")
	print("-" * 70)
	print(result.text[:500])
	else:
	print("Usage: python document_extractor.py <file_path>")
	print("\nSupported formats:")
	for ext in sorted(DocumentExtractor.SUPPORTED_EXTENSIONS):
	print(f" {ext}")