Spaces:
Running
Running
| """Image loader for scraped medical practice images.""" | |
| from pathlib import Path | |
| from PIL import Image | |
# Default scraped images directory (now under top-level data/scrapedimages).
# Built relative to this file: <directory of this file>/../../data/scrapedimages.
DEFAULT_SCRAPED_IMAGES_DIR = Path(__file__).parent.parent.parent / "data" / "scrapedimages"
class ImageLoader:
    """Load and manage scraped medical practice images.

    Images are looked up under ``base_path``, with one sub-directory per
    practice (for example ``base_path/drleedy.com/...``).

    Attributes:
        base_path: Root directory containing scraped images organized by practice
        practices: Sorted list of available practice directory names
            (read-only property, computed on first access and then cached)
    """

    def __init__(self, base_path: Path | str | None = None):
        """Initialize the ImageLoader.

        Args:
            base_path: Root directory containing scraped images.
                Defaults to project's scrapedimages folder.

        Raises:
            ValueError: If ``base_path`` is missing or is not a directory
        """
        if base_path is None:
            self.base_path = DEFAULT_SCRAPED_IMAGES_DIR
        else:
            self.base_path = Path(base_path)

        # is_dir() rather than exists(): a regular file would pass exists()
        # and only fail later, inside iterdir(), with a less helpful error.
        if not self.base_path.is_dir():
            raise ValueError(f"Image directory does not exist: {self.base_path}")

        # Cache for the ``practices`` property; None means "not scanned yet".
        self._practices: list[str] | None = None

    @property
    def practices(self) -> list[str]:
        """Get list of available practice directories.

        The directory is scanned once; later accesses return the cached list.
        Entries whose name starts with "." are skipped.

        Returns:
            Sorted list of practice directory names
            (e.g., ['drbirely.com', 'drleedy.com'])
        """
        if self._practices is None:
            self._practices = sorted(
                entry.name
                for entry in self.base_path.iterdir()
                if entry.is_dir() and not entry.name.startswith(".")
            )
        return self._practices

    def get_practice_path(self, practice_name: str) -> Path:
        """Get the full path to a practice directory.

        Args:
            practice_name: Name of the practice (e.g., 'drleedy.com')

        Returns:
            Path object pointing to the practice directory

        Raises:
            ValueError: If practice does not exist (or is not a directory)
        """
        practice_path = self.base_path / practice_name
        if not practice_path.is_dir():
            raise ValueError(
                f"Practice '{practice_name}' not found. "
                f"Available practices: {', '.join(self.practices)}"
            )
        return practice_path

    def list_images(self, practice_name: str, extensions: list[str] | None = None) -> list[Path]:
        """List all images for a given practice.

        The practice directory is searched recursively. Extensions are matched
        case-insensitively against the end of the file name, and only regular
        files are returned, each exactly once.

        Args:
            practice_name: Name of the practice
            extensions: List of file extensions to filter (e.g., ['.jpg', '.png'])
                If None, includes common image formats

        Returns:
            Sorted list of Path objects for all matching images

        Raises:
            ValueError: If practice does not exist
        """
        if extensions is None:
            extensions = [".jpg", ".jpeg", ".png", ".gif", ".webp", ".bmp"]
        suffixes = tuple(ext.lower() for ext in extensions)

        practice_path = self.get_practice_path(practice_name)

        # A single walk with a case-insensitive suffix test avoids the
        # duplicates that separate lower/upper-case globs produce on
        # case-insensitive filesystems, and also catches mixed case (".Jpg").
        return sorted(
            candidate
            for candidate in practice_path.rglob("*")
            if candidate.name.lower().endswith(suffixes) and candidate.is_file()
        )

    def count_images(self, practice_name: str) -> int:
        """Count total images for a practice.

        Args:
            practice_name: Name of the practice

        Returns:
            Number of images

        Raises:
            ValueError: If practice does not exist
        """
        return len(self.list_images(practice_name))

    def load_image(self, image_path: Path | str) -> Image.Image:
        """Load a single image.

        The returned image is handed to the caller still open; the caller
        should close it (or use it as a context manager) when done.

        Args:
            image_path: Path to the image file

        Returns:
            PIL Image object

        Raises:
            FileNotFoundError: If image does not exist
        """
        image_path = Path(image_path)
        if not image_path.exists():
            raise FileNotFoundError(f"Image not found: {image_path}")
        return Image.open(image_path)

    def get_image_info(self, image_path: Path | str) -> dict:
        """Get metadata about an image.

        Args:
            image_path: Path to the image file

        Returns:
            Dictionary with image metadata: path, filename, practice, size,
            width, height, format, mode and file_size_bytes. ``practice`` is
            the first directory below ``base_path`` that contains the image,
            or None when the image is not inside a practice directory.

        Raises:
            FileNotFoundError: If image does not exist
        """
        image_path = Path(image_path)

        # Images may sit in nested sub-directories of a practice, so the
        # practice is the first path component below base_path, not simply
        # the name of the immediate parent directory.
        practice = None
        if image_path.is_relative_to(self.base_path):
            relative_parts = image_path.relative_to(self.base_path).parts
            if len(relative_parts) > 1:
                practice = relative_parts[0]

        # Context manager guarantees the underlying file handle is released.
        with self.load_image(image_path) as img:
            return {
                "path": str(image_path),
                "filename": image_path.name,
                "practice": practice,
                "size": img.size,
                "width": img.width,
                "height": img.height,
                "format": img.format,
                "mode": img.mode,
                "file_size_bytes": image_path.stat().st_size,
            }

    def get_random_images(
        self, practice_name: str, n: int = 5, seed: int | None = None
    ) -> list[Path]:
        """Get random sample of images from a practice.

        Args:
            practice_name: Name of the practice
            n: Maximum number of images to return
            seed: Random seed for reproducibility

        Returns:
            List of at most n random image paths (fewer when the practice has
            fewer than n images; empty when n is zero or negative)

        Raises:
            ValueError: If practice does not exist
        """
        import random

        images = self.list_images(practice_name)

        # A private generator keeps a seeded call from reseeding the global
        # ``random`` module behind the caller's back; it yields the same
        # sample as seeding the global generator with the same value.
        rng = random if seed is None else random.Random(seed)
        sample_size = max(0, min(n, len(images)))
        return rng.sample(images, sample_size)

    def get_practice_stats(self, practice_name: str) -> dict:
        """Get statistics for a practice's images.

        Args:
            practice_name: Name of the practice

        Returns:
            Dictionary with practice statistics: practice, total_images,
            total_size_mb, formats (lower-cased suffix -> count) and
            practice_path

        Raises:
            ValueError: If practice does not exist
        """
        images = self.list_images(practice_name)
        total_size = sum(img_path.stat().st_size for img_path in images)

        # Get format distribution
        formats: dict[str, int] = {}
        for img_path in images:
            ext = img_path.suffix.lower()
            formats[ext] = formats.get(ext, 0) + 1

        return {
            "practice": practice_name,
            "total_images": len(images),
            "total_size_mb": total_size / (1024 * 1024),
            "formats": formats,
            "practice_path": str(self.get_practice_path(practice_name)),
        }

    def get_all_stats(self) -> dict:
        """Get statistics for all practices.

        Returns:
            Dictionary with overall statistics: per-practice stats under
            "practices", plus summed "total_images" and "total_size_mb"
        """
        all_stats: dict = {"practices": {}, "total_images": 0, "total_size_mb": 0.0}
        for practice in self.practices:
            practice_stats = self.get_practice_stats(practice)
            all_stats["practices"][practice] = practice_stats
            all_stats["total_images"] += practice_stats["total_images"]
            all_stats["total_size_mb"] += practice_stats["total_size_mb"]
        return all_stats
def list_practices(base_path: Path | str | None = None) -> list[str]:
    """Convenience wrapper: build an ImageLoader and return its practices.

    Args:
        base_path: Root directory containing scraped images.
            When None, ImageLoader falls back to its default directory.

    Returns:
        List of practice directory names
    """
    return ImageLoader(base_path).practices