Source code for trailpack.excel.reader

"""Excel reader module for loading and inspecting Excel files.

This module provides an ExcelReader class that:
- Loads only the structure (sheets and columns) into memory
- Provides access to sheet names
- Provides access to column names for mapping
"""

from pathlib import Path
from typing import Dict, List, Optional, Union
import openpyxl


[docs] class ExcelReader: """ Excel file reader that loads only sheet structure (sheets and columns) into memory. This class provides methods to inspect Excel file structure without loading all the data, making it memory-efficient for large files: - List available sheets - Get column names from specific sheets Example: >>> reader = ExcelReader("data.xlsx") >>> sheet_names = reader.sheets() >>> columns = reader.columns("Sheet1") """ def __init__(self, file_path: Union[str, Path], header_row: int = 1): """ Initialize ExcelReader and load sheet structure (sheets and columns) into memory. Only the sheet names and column headers are loaded, not the actual data. This makes it memory-efficient for large Excel files. Args: file_path: Path to the Excel file (.xlsx, .xlsm, .xltx, .xltm) header_row: Row number containing column headers (1-indexed). Defaults to 1. Raises: FileNotFoundError: If the file does not exist ValueError: If the file is not a valid Excel file """
[docs] self.file_path = Path(file_path)
[docs] self.header_row = header_row
if not self.file_path.exists(): raise FileNotFoundError(f"Excel file not found: {self.file_path}") if self.file_path.suffix not in ['.xlsx', '.xlsm', '.xltx', '.xltm']: raise ValueError(f"File must be an Excel file (.xlsx, .xlsm, .xltx, .xltm), got: {self.file_path.suffix}") # Load only sheet structure (sheets -> columns) into memory
[docs] self._sheet_columns: Dict[str, List[str]] = {}
self._load_structure()
[docs] def _load_structure(self): """ Load sheet structure (sheet names and column headers) from Excel file. Opens the workbook in read-only mode, extracts structure, then closes it. Only loads metadata, not actual data, for memory efficiency. """ try: # Open in read-only mode for efficiency workbook = openpyxl.load_workbook( self.file_path, read_only=True, data_only=True ) # Extract sheet names and columns for sheet_name in workbook.sheetnames: sheet = workbook[sheet_name] # Read header row to get column names columns = [] try: # Get the header row for row in sheet.iter_rows(min_row=self.header_row, max_row=self.header_row): for cell in row: # Convert cell value to string, use empty string for None column_name = str(cell.value) if cell.value is not None else "" columns.append(column_name) break # Only read the header row # Remove trailing empty columns while columns and columns[-1] == "": columns.pop() except Exception as e: # If we can't read columns (e.g., empty sheet), store empty list columns = [] self._sheet_columns[sheet_name] = columns # Close workbook immediately - we only need the structure workbook.close() except Exception as e: raise ValueError(f"Failed to load Excel file structure: {e}")
[docs] def sheets(self) -> List[str]: """ Get list of all sheet names in the workbook. Returns: List of sheet names as strings Example: >>> reader = ExcelReader("data.xlsx") >>> sheet_names = reader.sheets() >>> print(sheet_names) ['Sheet1', 'Sheet2', 'Data'] """ return list(self._sheet_columns.keys())
[docs] def columns(self, sheet_name: Optional[str] = None) -> List[str]: """ Get list of column names from a specific sheet. Args: sheet_name: Name of the sheet to read columns from. If None, uses the first sheet. Returns: List of column names as strings. Empty cells are returned as empty strings. Raises: ValueError: If sheet_name doesn't exist in the workbook Example: >>> reader = ExcelReader("data.xlsx") >>> columns = reader.columns("Sheet1") >>> print(columns) ['ID', 'Name', 'Value', 'Date'] """ # Get sheet name if sheet_name is None: # Use first sheet if no name specified sheet_name = self.sheets()[0] if self.sheets() else None if sheet_name is None: return [] # Check if sheet exists if sheet_name not in self._sheet_columns: raise ValueError( f"Sheet '{sheet_name}' not found. " f"Available sheets: {', '.join(self.sheets())}" ) return self._sheet_columns[sheet_name]
[docs] def get_structure(self) -> Dict[str, List[str]]: """ Get the complete sheet structure as a dictionary. Returns: Dictionary mapping sheet names to their column lists Example: >>> reader = ExcelReader("data.xlsx") >>> structure = reader.get_structure() >>> print(structure) {'Sheet1': ['ID', 'Name', 'Value'], 'Sheet2': ['Date', 'Amount']} """ return self._sheet_columns.copy()
[docs] def reload(self): """ Reload the sheet structure from the Excel file. Useful if the file has been modified and you want to refresh the structure. """ self._sheet_columns.clear() self._load_structure()
def __enter__(self): """Context manager entry.""" return self def __exit__(self, exc_type, exc_val, exc_tb): """Context manager exit - nothing to clean up as workbook is already closed.""" pass def __repr__(self) -> str: """String representation of ExcelReader.""" return f"ExcelReader(file_path='{self.file_path}', sheets={len(self.sheets())})"