"""Image loader for scraped medical practice images."""

import random
from collections import Counter
from pathlib import Path

from PIL import Image

# Default scraped images directory (now under top-level data/scrapedimages)
DEFAULT_SCRAPED_IMAGES_DIR = Path(__file__).parent.parent.parent / "data" / "scrapedimages"

# File-name suffixes treated as images when the caller supplies no filter.
_DEFAULT_IMAGE_EXTENSIONS = (".jpg", ".jpeg", ".png", ".gif", ".webp", ".bmp")


class ImageLoader:
    """Load and manage scraped medical practice images.

    Images are expected to live under ``base_path/<practice>/...``, where each
    immediate sub-directory of ``base_path`` is one practice.

    Attributes:
        base_path: Root directory containing scraped images organized by practice
        practices: List of available practice directories
    """

    def __init__(self, base_path: Path | str | None = None):
        """Initialize the ImageLoader.

        Args:
            base_path: Root directory containing scraped images.
                      Defaults to project's scrapedimages folder.

        Raises:
            ValueError: If the directory does not exist or is not a directory
        """
        self.base_path = (
            DEFAULT_SCRAPED_IMAGES_DIR if base_path is None else Path(base_path)
        )

        if not self.base_path.exists():
            raise ValueError(f"Image directory does not exist: {self.base_path}")
        # A regular file would pass exists() but break iterdir()/glob() later,
        # so reject it here with the same exception type.
        if not self.base_path.is_dir():
            raise ValueError(f"Image path is not a directory: {self.base_path}")

        # Lazily populated cache for the ``practices`` property.
        self._practices: list[str] | None = None

    @property
    def practices(self) -> list[str]:
        """Get list of available practice directories.

        The directory is scanned once on first access and the result is cached.
        Hidden directories (names starting with ``.``) are skipped.

        Returns:
            Sorted list of practice directory names
            (e.g., ['drbirely.com', 'drleedy.com'])
        """
        if self._practices is None:
            self._practices = sorted(
                d.name
                for d in self.base_path.iterdir()
                if d.is_dir() and not d.name.startswith(".")
            )
        return self._practices

    def get_practice_path(self, practice_name: str) -> Path:
        """Get the full path to a practice directory.

        Args:
            practice_name: Name of the practice (e.g., 'drleedy.com')

        Returns:
            Path object pointing to the practice directory

        Raises:
            ValueError: If practice does not exist (or is not a directory)
        """
        practice_path = self.base_path / practice_name
        # is_dir() rather than exists(): a stray file is not a practice.
        if not practice_path.is_dir():
            raise ValueError(
                f"Practice '{practice_name}' not found. "
                f"Available practices: {', '.join(self.practices)}"
            )
        return practice_path

    def list_images(
        self, practice_name: str, extensions: list[str] | None = None
    ) -> list[Path]:
        """List all images for a given practice.

        The practice directory is searched recursively. Extension matching is
        case-insensitive and each file appears at most once.

        Args:
            practice_name: Name of the practice
            extensions: List of file extensions to filter (e.g., ['.jpg', '.png'])
                       If None, includes common image formats

        Returns:
            Sorted list of Path objects for all matching image files

        Raises:
            ValueError: If practice does not exist
        """
        if extensions is None:
            extensions = _DEFAULT_IMAGE_EXTENSIONS
        # str.endswith accepts a tuple; lower-casing both sides makes the match
        # case-insensitive (covers '.JPG', '.Jpg', ...).
        wanted = tuple({ext.lower() for ext in extensions})

        practice_path = self.get_practice_path(practice_name)

        # One walk instead of two globs per extension: avoids duplicate hits on
        # case-insensitive filesystems and skips directories named like images.
        return sorted(
            candidate
            for candidate in practice_path.rglob("*")
            if candidate.name.lower().endswith(wanted) and candidate.is_file()
        )

    def count_images(self, practice_name: str) -> int:
        """Count total images for a practice.

        Args:
            practice_name: Name of the practice

        Returns:
            Number of images

        Raises:
            ValueError: If practice does not exist
        """
        return len(self.list_images(practice_name))

    def load_image(self, image_path: Path | str) -> Image.Image:
        """Load a single image.

        The returned image is whatever ``Image.open`` yields; the caller owns
        it and is responsible for closing it.

        Args:
            image_path: Path to the image file

        Returns:
            PIL Image object

        Raises:
            FileNotFoundError: If image does not exist
        """
        image_path = Path(image_path)
        if not image_path.exists():
            raise FileNotFoundError(f"Image not found: {image_path}")
        return Image.open(image_path)

    def get_image_info(self, image_path: Path | str) -> dict:
        """Get metadata about an image.

        Args:
            image_path: Path to the image file

        Returns:
            Dictionary with image metadata (size, format, mode, etc.).
            ``practice`` is the top-level directory under ``base_path`` that
            contains the image, or None when the image is not inside a
            practice directory of this loader.

        Raises:
            FileNotFoundError: If image does not exist
        """
        image_path = Path(image_path)

        # Close the image once the metadata has been read so the underlying
        # file handle is not leaked.
        with self.load_image(image_path) as img:
            practice: str | None = None
            if image_path.is_relative_to(self.base_path):
                relative_parts = image_path.relative_to(self.base_path).parts
                # parts[0] is the practice directory even for nested images;
                # a file sitting directly in base_path belongs to no practice.
                if len(relative_parts) > 1:
                    practice = relative_parts[0]

            return {
                "path": str(image_path),
                "filename": image_path.name,
                "practice": practice,
                "size": img.size,
                "width": img.width,
                "height": img.height,
                "format": img.format,
                "mode": img.mode,
                "file_size_bytes": image_path.stat().st_size,
            }

    def get_random_images(
        self, practice_name: str, n: int = 5, seed: int | None = None
    ) -> list[Path]:
        """Get random sample of images from a practice.

        Args:
            practice_name: Name of the practice
            n: Number of images to return
            seed: Random seed for reproducibility

        Returns:
            List of up to n random image paths (fewer if the practice has
            fewer than n images)

        Raises:
            ValueError: If practice does not exist, or if n is negative
        """
        images = self.list_images(practice_name)
        # A private generator keeps a seeded call from reseeding the
        # process-wide ``random`` state used by unrelated code.
        rng = random if seed is None else random.Random(seed)
        return rng.sample(images, min(n, len(images)))

    def get_practice_stats(self, practice_name: str) -> dict:
        """Get statistics for a practice's images.

        Args:
            practice_name: Name of the practice

        Returns:
            Dictionary with practice statistics: name, image count, total size
            in MiB, per-extension counts (lower-cased suffix -> count) and the
            practice directory path

        Raises:
            ValueError: If practice does not exist
        """
        practice_path = self.get_practice_path(practice_name)
        images = self.list_images(practice_name)

        total_size = sum(image.stat().st_size for image in images)
        # Format distribution keyed by lower-cased suffix.
        formats = dict(Counter(image.suffix.lower() for image in images))

        return {
            "practice": practice_name,
            "total_images": len(images),
            "total_size_mb": total_size / (1024 * 1024),
            "formats": formats,
            "practice_path": str(practice_path),
        }

    def get_all_stats(self) -> dict:
        """Get statistics for all practices.

        Returns:
            Dictionary with per-practice statistics under ``practices`` plus
            the overall ``total_images`` and ``total_size_mb``
        """
        all_stats: dict = {"practices": {}, "total_images": 0, "total_size_mb": 0.0}

        for practice in self.practices:
            practice_stats = self.get_practice_stats(practice)
            all_stats["practices"][practice] = practice_stats
            all_stats["total_images"] += practice_stats["total_images"]
            all_stats["total_size_mb"] += practice_stats["total_size_mb"]

        return all_stats


def list_practices(base_path: Path | str | None = None) -> list[str]:
    """Convenience function to list all available practices.

    Args:
        base_path: Root directory containing scraped images.
                  Defaults to project's scrapedimages folder.

    Returns:
        List of practice directory names

    Raises:
        ValueError: If the image directory does not exist
    """
    return ImageLoader(base_path).practices