a360-upscaler / warp /data /image_loader.py
nicka360's picture
Initial A360 migration
08a5a82
"""Image loader for scraped medical practice images."""
from pathlib import Path
from PIL import Image
# Default root directory for scraped images: three directory levels above this
# module, then data/scrapedimages. ImageLoader falls back to this location when
# it is constructed without an explicit base_path.
DEFAULT_SCRAPED_IMAGES_DIR = Path(__file__).parent.parent.parent / "data" / "scrapedimages"
class ImageLoader:
    """Load and manage scraped medical practice images.

    Images are expected to be laid out as ``base_path/<practice>/...``, with one
    sub-directory per practice (nested folders inside a practice are allowed).

    Attributes:
        base_path: Root directory containing scraped images organized by practice
        practices: List of available practice directories
    """

    def __init__(self, base_path: Path | str | None = None):
        """Initialize the ImageLoader.

        Args:
            base_path: Root directory containing scraped images.
                Defaults to project's scrapedimages folder.

        Raises:
            ValueError: If the directory does not exist or is not a directory
        """
        if base_path is None:
            self.base_path = DEFAULT_SCRAPED_IMAGES_DIR
        else:
            self.base_path = Path(base_path)
        if not self.base_path.exists():
            raise ValueError(f"Image directory does not exist: {self.base_path}")
        # A regular file would pass the check above and only fail later, inside
        # iterdir(), with a less helpful error.
        if not self.base_path.is_dir():
            raise ValueError(f"Image path is not a directory: {self.base_path}")
        # Lazily populated cache for the `practices` property.
        self._practices: list[str] | None = None

    @property
    def practices(self) -> list[str]:
        """Get list of available practice directories.

        The directory listing is read once and cached; hidden directories
        (names starting with ".") are skipped.

        Returns:
            Sorted list of practice directory names
            (e.g., ['drbirely.com', 'drleedy.com'])
        """
        if self._practices is None:
            self._practices = sorted(
                d.name
                for d in self.base_path.iterdir()
                if d.is_dir() and not d.name.startswith(".")
            )
        return self._practices

    def get_practice_path(self, practice_name: str) -> Path:
        """Get the full path to a practice directory.

        Args:
            practice_name: Name of the practice (e.g., 'drleedy.com')

        Returns:
            Path object pointing to the practice directory

        Raises:
            ValueError: If practice does not exist (or is not a directory)
        """
        practice_path = self.base_path / practice_name
        # is_dir() rather than exists(): a stray file in base_path is not a
        # practice, which matches how the `practices` property filters entries.
        if not practice_path.is_dir():
            raise ValueError(
                f"Practice '{practice_name}' not found. "
                f"Available practices: {', '.join(self.practices)}"
            )
        return practice_path

    def list_images(self, practice_name: str, extensions: list[str] | None = None) -> list[Path]:
        """List all images for a given practice.

        The practice directory is searched recursively. Extension matching is
        case-insensitive ('.jpg' also matches '.JPG' and '.Jpg'), each file is
        returned once, and only regular files are included.

        Args:
            practice_name: Name of the practice
            extensions: List of file extensions to filter (e.g., ['.jpg', '.png'])
                If None, includes common image formats

        Returns:
            Sorted list of Path objects for all matching images

        Raises:
            ValueError: If practice does not exist
        """
        if extensions is None:
            extensions = [".jpg", ".jpeg", ".png", ".gif", ".webp", ".bmp"]
        # str.endswith accepts a tuple, so one call tests every extension.
        suffixes = tuple(ext.lower() for ext in extensions)
        practice_path = self.get_practice_path(practice_name)
        # A single recursive walk avoids the duplicates that globbing the lower-
        # and upper-case pattern separately produces on case-insensitive matching.
        return sorted(
            path
            for path in practice_path.rglob("*")
            if path.name.lower().endswith(suffixes) and path.is_file()
        )

    def count_images(self, practice_name: str) -> int:
        """Count total images for a practice.

        Args:
            practice_name: Name of the practice

        Returns:
            Number of images

        Raises:
            ValueError: If practice does not exist
        """
        return len(self.list_images(practice_name))

    def load_image(self, image_path: Path | str) -> Image.Image:
        """Load a single image.

        The returned image is whatever ``Image.open`` produces, so the caller
        owns it and should close it (or use it as a context manager) when done.

        Args:
            image_path: Path to the image file

        Returns:
            PIL Image object

        Raises:
            FileNotFoundError: If image does not exist
        """
        image_path = Path(image_path)
        if not image_path.exists():
            raise FileNotFoundError(f"Image not found: {image_path}")
        return Image.open(image_path)

    def _practice_of(self, image_path: Path) -> str | None:
        """Return the practice directory name an image belongs to, if any.

        Args:
            image_path: Path to an image file

        Returns:
            The first path component below ``base_path``, or None when the path
            is outside ``base_path`` or sits directly in it (no practice folder).
        """
        try:
            relative = image_path.relative_to(self.base_path)
        except ValueError:
            return None
        parts = relative.parts
        # With a single component the file lies directly in base_path.
        return parts[0] if len(parts) > 1 else None

    def get_image_info(self, image_path: Path | str) -> dict:
        """Get metadata about an image.

        Args:
            image_path: Path to the image file

        Returns:
            Dictionary with image metadata (size, format, mode, etc.).
            "practice" is the practice directory containing the image, even when
            the image is in a nested sub-folder, or None if it cannot be derived.

        Raises:
            FileNotFoundError: If image does not exist
        """
        image_path = Path(image_path)
        # Only header metadata is needed, so close the file as soon as it has
        # been read instead of leaving the handle open.
        with self.load_image(image_path) as img:
            info = {
                "path": str(image_path),
                "filename": image_path.name,
                "practice": self._practice_of(image_path),
                "size": img.size,
                "width": img.width,
                "height": img.height,
                "format": img.format,
                "mode": img.mode,
                "file_size_bytes": image_path.stat().st_size,
            }
        return info

    def get_random_images(
        self, practice_name: str, n: int = 5, seed: int | None = None
    ) -> list[Path]:
        """Get random sample of images from a practice.

        Args:
            practice_name: Name of the practice
            n: Number of images to return
            seed: Random seed for reproducibility

        Returns:
            List of up to n random image paths (fewer if the practice has
            fewer than n images)

        Raises:
            ValueError: If practice does not exist
        """
        import random

        images = self.list_images(practice_name)
        # A private generator gives the same sample for a given seed without
        # reseeding the process-wide RNG that other code may rely on.
        rng = random.Random(seed) if seed is not None else random
        return rng.sample(images, min(n, len(images)))

    def get_practice_stats(self, practice_name: str) -> dict:
        """Get statistics for a practice's images.

        Args:
            practice_name: Name of the practice

        Returns:
            Dictionary with practice statistics: "practice", "total_images",
            "total_size_mb", "formats" (lower-cased suffix -> count) and
            "practice_path"

        Raises:
            ValueError: If practice does not exist
        """
        images = self.list_images(practice_name)
        total_size = sum(img.stat().st_size for img in images)
        # Get format distribution, keyed by lower-cased file suffix
        formats: dict[str, int] = {}
        for img_path in images:
            ext = img_path.suffix.lower()
            formats[ext] = formats.get(ext, 0) + 1
        return {
            "practice": practice_name,
            "total_images": len(images),
            "total_size_mb": total_size / (1024 * 1024),
            "formats": formats,
            "practice_path": str(self.get_practice_path(practice_name)),
        }

    def get_all_stats(self) -> dict:
        """Get statistics for all practices.

        Returns:
            Dictionary with overall statistics: per-practice stats under
            "practices", plus aggregated "total_images" and "total_size_mb"
        """
        all_stats: dict = {"practices": {}, "total_images": 0, "total_size_mb": 0.0}
        for practice in self.practices:
            practice_stats = self.get_practice_stats(practice)
            all_stats["practices"][practice] = practice_stats
            all_stats["total_images"] += practice_stats["total_images"]
            all_stats["total_size_mb"] += practice_stats["total_size_mb"]
        return all_stats
def list_practices(base_path: Path | str | None = None) -> list[str]:
    """Return the names of all available practices.

    Shortcut for building an ImageLoader and reading its ``practices`` property.

    Args:
        base_path: Root directory containing scraped images.
            Defaults to project's scrapedimages folder.

    Returns:
        List of practice directory names
    """
    return ImageLoader(base_path).practices