#!/usr/bin/env python3
"""Create compressed archives of catalog and cache data.

This script packages the catalog and cached_results directories into
tar.gz archives for efficient storage in Git LFS.

Usage:
    # Create both archives
    python scripts/create_archives.py

    # Create only catalog archive
    python scripts/create_archives.py --catalog-only

    # Create only cache archive
    python scripts/create_archives.py --cache-only

    # Show archive info without creating
    python scripts/create_archives.py --info
"""

import argparse
import logging
import tarfile
from pathlib import Path
from typing import Optional, Sequence, Tuple

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s",
)
logger = logging.getLogger(__name__)

# Paths
PROJECT_ROOT = Path(__file__).parent.parent
CATALOG_DIR = PROJECT_ROOT / "data" / "catalog"
CACHE_DIR = PROJECT_ROOT / "cached_results"
CATALOG_ARCHIVE = PROJECT_ROOT / "data" / "catalog.tar.gz"
CACHE_ARCHIVE = PROJECT_ROOT / "data" / "cache.tar.gz"


def _scan_dir(path: Path) -> Tuple[int, int]:
    """Walk a directory once and return ``(total_bytes, file_count)``."""
    total_bytes = 0
    file_count = 0
    for entry in path.rglob("*"):
        if entry.is_file():
            total_bytes += entry.stat().st_size
            file_count += 1
    return total_bytes, file_count


def get_dir_size(path: Path) -> int:
    """Get total size of directory in bytes.

    Args:
        path: Directory to scan recursively.

    Returns:
        Sum of the sizes of all regular files below ``path``.
    """
    return sum(f.stat().st_size for f in path.rglob("*") if f.is_file())


def get_file_count(path: Path) -> int:
    """Get total number of files in directory.

    Args:
        path: Directory to scan recursively.

    Returns:
        Number of regular files below ``path`` (directories are not counted).
    """
    return sum(1 for f in path.rglob("*") if f.is_file())


def format_size(size_bytes: int) -> str:
    """Format size in human-readable format.

    Args:
        size_bytes: Size in bytes.

    Returns:
        The size with one decimal place and a unit from B to TB,
        using 1024 as the step between units (e.g. ``"1.5 KB"``).
    """
    size = float(size_bytes)
    for unit in ("B", "KB", "MB", "GB"):
        if size < 1024:
            return f"{size:.1f} {unit}"
        size /= 1024
    return f"{size:.1f} TB"


def create_archive(source_dir: Path, archive_path: Path) -> None:
    """Create a tar.gz archive from a directory.

    The archive is written to a temporary sibling file and renamed into
    place only after it has been written completely, so a failure never
    leaves a truncated archive at ``archive_path``.

    Args:
        source_dir: Directory to archive
        archive_path: Output archive path
    """
    # A regular file at this path is not a valid source either.
    if not source_dir.is_dir():
        logger.error("Source directory not found: %s", source_dir)
        return

    source_size, file_count = _scan_dir(source_dir)

    logger.info("Archiving %s/", source_dir.name)
    logger.info(" Source: %s (%d files)", format_size(source_size), file_count)

    # Create archive
    archive_path.parent.mkdir(parents=True, exist_ok=True)
    tmp_path = archive_path.with_name(archive_path.name + ".tmp")
    try:
        with tarfile.open(tmp_path, "w:gz") as tar:
            # Add directory with its name as the archive root
            tar.add(source_dir, arcname=source_dir.name)
        tmp_path.replace(archive_path)
    finally:
        # Only still present when writing or renaming failed.
        if tmp_path.exists():
            tmp_path.unlink()

    archive_size = archive_path.stat().st_size
    # Guard against division by zero for an empty source directory.
    compression_ratio = (1 - archive_size / source_size) * 100 if source_size > 0 else 0

    logger.info(" Archive: %s", format_size(archive_size))
    logger.info(" Compression: %.1f%% reduction", compression_ratio)
    logger.info(" Created: %s", archive_path)


def show_info() -> None:
    """Show information about directories and existing archives."""
    print("\n=== Directory Info ===")
    for name, path in [("Catalog", CATALOG_DIR), ("Cache", CACHE_DIR)]:
        if path.exists():
            size, count = _scan_dir(path)
            print(f"{name}: {format_size(size)} ({count} files)")
        else:
            print(f"{name}: not found")

    print("\n=== Archive Info ===")
    for name, path in [("Catalog", CATALOG_ARCHIVE), ("Cache", CACHE_ARCHIVE)]:
        if path.exists():
            size = path.stat().st_size
            print(f"{name}: {format_size(size)}")
        else:
            print(f"{name}: not created")


def main(argv: Optional[Sequence[str]] = None) -> None:
    """Parse command-line options and create the requested archives.

    Args:
        argv: Argument list to parse; defaults to ``sys.argv[1:]`` when None.
    """
    parser = argparse.ArgumentParser(
        description="Create compressed archives of catalog and cache data"
    )
    # The two "only" flags contradict each other; together they used to
    # select nothing, so argparse now rejects the combination.
    only_group = parser.add_mutually_exclusive_group()
    only_group.add_argument(
        "--catalog-only",
        action="store_true",
        help="Only create catalog archive",
    )
    only_group.add_argument(
        "--cache-only",
        action="store_true",
        help="Only create cache archive",
    )
    parser.add_argument(
        "--info",
        action="store_true",
        help="Show info about directories and archives",
    )
    args = parser.parse_args(argv)

    if args.info:
        show_info()
        return

    # Determine what to archive
    do_catalog = not args.cache_only
    do_cache = not args.catalog_only

    if do_catalog:
        create_archive(CATALOG_DIR, CATALOG_ARCHIVE)

    if do_cache:
        create_archive(CACHE_DIR, CACHE_ARCHIVE)

    print("\n=== Summary ===")
    show_info()


if __name__ == "__main__":
    main()