# Source code for trailpack.cli

"""Command-line interface for Trailpack.

Provides commands for data processing, validation, and project management.
"""

from pathlib import Path
from typing import Optional
import sys

import typer
from rich.console import Console
from rich.table import Table
from rich.panel import Panel
from rich import print as rprint

# Typer application object; the @app.command() / @app.callback() decorators
# further down register their functions on it.
app = typer.Typer(
    name="trailpack",
    help="Trailpack - Dataset standardization tool for LCA and sustainability data",
    add_completion=False,
)

# Shared Rich console used by the commands in this module for styled output.
console = Console()
@app.command()
def ui(
    port: int = typer.Option(8501, "--port", "-p", help="Port to run Streamlit on"),
    host: str = typer.Option("localhost", "--host", help="Host to bind to"),
):
    """
    Launch the Streamlit UI for interactive data mapping.

    Opens a browser at localhost:8501 with the full interactive interface
    for mapping columns to ontologies and exporting data packages.

    Example:
        trailpack ui
        trailpack ui --port 8080
    """
    # NOTE: the docstring above doubles as the command's --help text.
    # Exits with code 1 when the Streamlit app file is missing, when the
    # `streamlit` executable cannot be found, or when Streamlit itself
    # returns a non-zero status. Ctrl-C is treated as a normal shutdown.
    import subprocess

    # Get the path to streamlit_app.py (shipped next to this module, in ui/).
    app_path = Path(__file__).parent / "ui" / "streamlit_app.py"

    if not app_path.exists():
        console.print(f"[red]Error: Could not find streamlit_app.py at {app_path}[/red]")
        raise typer.Exit(1)

    console.print(f"[green]Starting Streamlit UI on {host}:{port}...[/green]")

    # Argument list (no shell), so host/port values are never shell-interpreted.
    command = [
        "streamlit",
        "run",
        str(app_path),
        f"--server.port={port}",
        f"--server.address={host}",
    ]

    try:
        subprocess.run(command, check=True)
    except FileNotFoundError:
        # Raised by subprocess when the `streamlit` executable is not on PATH;
        # without this branch the user would get a raw traceback.
        console.print(
            "[red]Error: 'streamlit' executable not found. "
            "Is Streamlit installed and on your PATH?[/red]"
        )
        raise typer.Exit(1)
    except subprocess.CalledProcessError as e:
        console.print(f"[red]Error launching Streamlit: {e}[/red]")
        raise typer.Exit(1)
    except KeyboardInterrupt:
        console.print("\n[yellow]Streamlit UI stopped[/yellow]")
@app.command()
def process(
    data: Path = typer.Option(..., "--data", "-d", help="Path to data file (Excel, CSV)"),
    sheet: Optional[str] = typer.Option(None, "--sheet", "-s", help="Sheet name (for Excel files)"),
    mapping: Path = typer.Option(..., "--mapping", "-m", help="Path to mapping config JSON"),
    metadata: Path = typer.Option(..., "--metadata", "-M", help="Path to metadata config JSON"),
    output: Path = typer.Option(..., "--output", "-o", help="Output Parquet file path"),
    validate_standard: bool = typer.Option(True, "--validate/--no-validate", help="Validate against Trailpack standard"),
):
    """
    Process data file with configs to create a Frictionless Data Package.

    Reads data, applies column mappings, validates against standard,
    and exports to Parquet format with embedded metadata.

    Example:
        trailpack process \\
            --data inventory.xlsx \\
            --sheet "Sheet1" \\
            --mapping mapping.json \\
            --metadata metadata.json \\
            --output clean-data.parquet
    """
    # NOTE: the docstring above doubles as the command's --help text.
    # Any failure is reported as a single "Error: ..." line and the command
    # exits with code 1.
    # Imported lazily so other commands and --help do not load these modules.
    from trailpack.io.smart_reader import SmartDataReader
    from trailpack.config import load_configs, extract_column_mappings, extract_general_details
    from trailpack.packing.export_service import DataPackageExporter

    console.print(Panel.fit(
        "[bold cyan]Trailpack Process[/bold cyan]\n"
        f"Data: {data}\n"
        f"Sheet: {sheet or 'N/A'}\n"
        f"Mapping: {mapping}\n"
        f"Metadata: {metadata}\n"
        f"Output: {output}",
        title="Configuration"
    ))

    try:
        # 1. Validate inputs
        if not data.exists():
            console.print(f"[red]Error: Data file not found: {data}[/red]")
            raise typer.Exit(1)

        # 2. Load configs
        console.print("[cyan]Loading configurations...[/cyan]")
        mapping_config, metadata_config = load_configs(
            mapping_path=mapping,
            metadata_path=metadata
        )

        # 3. Read data with SmartDataReader
        console.print("[cyan]Reading data file with SmartDataReader...[/cyan]")
        reader = SmartDataReader(data)
        console.print(f" Engine: {reader.engine}")
        console.print(f" Estimated memory: {reader.estimate_memory()}")

        df = reader.read(sheet_name=sheet)
        console.print(f" Loaded {len(df)} rows, {len(df.columns)} columns")

        # 4. Extract mappings and metadata
        column_mappings = extract_column_mappings(mapping_config)
        general_details = extract_general_details(metadata_config)

        console.print(f"[cyan]Applying {len(column_mappings)} column mappings...[/cyan]")

        # 5. Create exporter and export
        console.print("[cyan]Creating data package...[/cyan]")
        exporter = DataPackageExporter(
            dataframe=df,
            column_mappings=column_mappings,
            general_details=general_details,
            language="en"
        )

        output_path, quality_level, validation_result = exporter.export(
            str(output),
            validate_standard=validate_standard
        )

        # 6. Display results
        console.print(f"[green]✓ Data package created: {output_path}[/green]")

        if quality_level:
            quality_colors = {
                "STRICT": "green",
                "STANDARD": "cyan",
                "BASIC": "yellow",
                "INVALID": "red"
            }
            # Unknown levels fall back to white instead of failing.
            color = quality_colors.get(quality_level, "white")
            console.print(f" Quality Level: [{color}]{quality_level}[/{color}]")

        if validation_result:
            if validation_result.errors:
                console.print(f" [red]Errors: {len(validation_result.errors)}[/red]")
            if validation_result.warnings:
                console.print(f" [yellow]Warnings: {len(validation_result.warnings)}[/yellow]")
            if validation_result.info:
                console.print(f" [cyan]Info: {len(validation_result.info)}[/cyan]")

        console.print("[green]✓ Process completed successfully[/green]")

    except typer.Exit:
        # typer.Exit derives from Exception, so the generic handler below would
        # otherwise catch the deliberate exit above and print a stray "Error: 1".
        raise
    except Exception as e:
        console.print(f"[red]Error: {e}[/red]")
        raise typer.Exit(1)
@app.command()
def validate(
    data: Path = typer.Option(..., "--data", "-d", help="Path to data file (Excel, CSV)"),
    sheet: Optional[str] = typer.Option(None, "--sheet", "-s", help="Sheet name (for Excel files)"),
    mapping: Path = typer.Option(..., "--mapping", "-m", help="Path to mapping config JSON"),
    metadata: Path = typer.Option(..., "--metadata", "-M", help="Path to metadata config JSON"),
):
    """
    Validate data and configs without creating output (dry-run).

    Checks data quality, type consistency, and standard compliance
    without writing any files.

    Example:
        trailpack validate \\
            --data inventory.xlsx \\
            --mapping mapping.json \\
            --metadata metadata.json
    """
    # NOTE: the docstring above doubles as the command's --help text.
    # Exits with code 1 when the data file is missing, when any step raises,
    # or when the computed quality level is "INVALID".
    # Imported lazily so other commands and --help do not load these modules.
    from trailpack.io.smart_reader import SmartDataReader
    from trailpack.config import load_configs, extract_column_mappings, extract_general_details
    from trailpack.packing.metadata_builder import MetadataBuilder
    from trailpack.validation.standard_validator import StandardValidator

    console.print(Panel.fit(
        "[bold cyan]Trailpack Validate[/bold cyan]\n"
        f"Data: {data}\n"
        f"Sheet: {sheet or 'N/A'}\n"
        f"Mapping: {mapping}\n"
        f"Metadata: {metadata}",
        title="Validation Configuration"
    ))

    try:
        # 1. Validate inputs
        if not data.exists():
            console.print(f"[red]Error: Data file not found: {data}[/red]")
            raise typer.Exit(1)

        # 2. Load configs
        console.print("[cyan]Loading configurations...[/cyan]")
        mapping_config, metadata_config = load_configs(
            mapping_path=mapping,
            metadata_path=metadata
        )

        # 3. Read data
        console.print("[cyan]Reading data file...[/cyan]")
        reader = SmartDataReader(data)
        console.print(f" Engine: {reader.engine}")

        df = reader.read(sheet_name=sheet)
        console.print(f" Loaded {len(df)} rows, {len(df.columns)} columns")

        # 4. Extract mappings and metadata
        column_mappings = extract_column_mappings(mapping_config)
        general_details = extract_general_details(metadata_config)

        # 5. Build metadata
        console.print("[cyan]Building metadata...[/cyan]")
        metadata_builder = MetadataBuilder(
            dataframe=df,
            column_mappings=column_mappings,
            general_details=general_details,
            language="en"
        )
        package_metadata = metadata_builder.build()

        # 6. Validate
        console.print("[cyan]Validating against Trailpack standard...[/cyan]")
        validator = StandardValidator()
        validation_result = validator.validate(df, package_metadata)
        quality_level = validator.determine_quality_level(validation_result)

        # 7. Display results
        console.print("\n" + "=" * 60)
        console.print("[bold]Validation Results[/bold]")
        console.print("=" * 60 + "\n")

        # Quality level
        quality_colors = {
            "STRICT": "green",
            "STANDARD": "cyan",
            "BASIC": "yellow",
            "INVALID": "red"
        }
        color = quality_colors.get(quality_level, "white")
        # Rich needs a space between style words: "green bold", not "greenbold".
        console.print(f"Quality Level: [{color} bold]{quality_level}[/{color} bold]\n")

        # Errors
        if validation_result.errors:
            console.print(f"[red bold]Errors ({len(validation_result.errors)}):[/red bold]")
            for error in validation_result.errors:
                console.print(f" [red]✗[/red] {error}")
            console.print()
        else:
            console.print("[green]✓ No errors[/green]\n")

        # Warnings
        if validation_result.warnings:
            console.print(f"[yellow bold]Warnings ({len(validation_result.warnings)}):[/yellow bold]")
            for warning in validation_result.warnings:
                console.print(f" [yellow]⚠[/yellow] {warning}")
            console.print()
        else:
            console.print("[green]✓ No warnings[/green]\n")

        # Info
        if validation_result.info:
            console.print(f"[cyan bold]Info ({len(validation_result.info)}):[/cyan bold]")
            for info in validation_result.info:
                console.print(f" [cyan]ℹ[/cyan] {info}")
            console.print()

        # Summary
        if quality_level == "INVALID":
            console.print("[red]✗ Validation failed - please fix errors before export[/red]")
            raise typer.Exit(1)
        else:
            console.print("[green]✓ Validation passed - ready for export[/green]")

    except typer.Exit:
        # typer.Exit derives from Exception, so the generic handler below would
        # otherwise catch the deliberate exits above and print a stray "Error: 1".
        raise
    except Exception as e:
        console.print(f"[red]Error: {e}[/red]")
        raise typer.Exit(1)
@app.command()
def check(
    parquet_file: Path = typer.Argument(..., help="Path to Parquet file to check"),
):
    """
    Check existing Parquet file for standard compliance.

    Reads the Parquet file, extracts metadata, validates against
    Trailpack standard, and displays quality level.

    Example:
        trailpack check my-dataset.parquet
    """
    # NOTE: the docstring above doubles as the command's --help text.
    # Exits with code 1 when the file is missing, when any step raises, or
    # when the computed quality level is "INVALID".
    import json

    import pandas as pd
    from trailpack.validation.standard_validator import StandardValidator

    console.print(Panel.fit(
        f"[bold cyan]Checking:[/bold cyan] {parquet_file}",
        title="Trailpack Check"
    ))

    if not parquet_file.exists():
        console.print(f"[red]Error: File not found: {parquet_file}[/red]")
        raise typer.Exit(1)

    try:
        # 1. Read Parquet file
        console.print("[cyan]Reading Parquet file...[/cyan]")
        df = pd.read_parquet(parquet_file)
        console.print(f" Loaded {len(df)} rows, {len(df.columns)} columns")

        # 2. Extract metadata from Parquet
        console.print("[cyan]Extracting metadata...[/cyan]")

        # Imported inside the try so a missing pyarrow is reported as a normal
        # "Error: ..." line rather than a traceback.
        import pyarrow.parquet as pq
        parquet_table = pq.read_table(parquet_file)

        # Extract custom metadata if available; an empty dict is validated
        # when the file carries none.
        package_metadata = {}
        if parquet_table.schema.metadata:
            metadata_bytes = parquet_table.schema.metadata.get(b'trailpack_metadata')
            if metadata_bytes:
                package_metadata = json.loads(metadata_bytes.decode('utf-8'))
                console.print(" Found Trailpack metadata")
            else:
                console.print("[yellow] Warning: No Trailpack metadata found in file[/yellow]")
        else:
            console.print("[yellow] Warning: No metadata found in Parquet file[/yellow]")

        # 3. Validate
        console.print("[cyan]Validating against Trailpack standard...[/cyan]")
        validator = StandardValidator()
        validation_result = validator.validate(df, package_metadata)
        quality_level = validator.determine_quality_level(validation_result)

        # 4. Display results
        console.print("\n" + "=" * 60)
        console.print("[bold]Validation Results[/bold]")
        console.print("=" * 60 + "\n")

        # File info
        file_size = parquet_file.stat().st_size
        console.print(f"File: {parquet_file}")
        console.print(f"Size: {file_size / (1024*1024):.2f} MB")
        console.print(f"Rows: {len(df)}, Columns: {len(df.columns)}\n")

        # Quality level
        quality_colors = {
            "STRICT": "green",
            "STANDARD": "cyan",
            "BASIC": "yellow",
            "INVALID": "red"
        }
        color = quality_colors.get(quality_level, "white")
        # Rich needs a space between style words: "green bold", not "greenbold".
        console.print(f"Quality Level: [{color} bold]{quality_level}[/{color} bold]\n")

        # Errors
        if validation_result.errors:
            console.print(f"[red bold]Errors ({len(validation_result.errors)}):[/red bold]")
            for error in validation_result.errors:
                console.print(f" [red]✗[/red] {error}")
            console.print()
        else:
            console.print("[green]✓ No errors[/green]\n")

        # Warnings
        if validation_result.warnings:
            console.print(f"[yellow bold]Warnings ({len(validation_result.warnings)}):[/yellow bold]")
            for warning in validation_result.warnings:
                console.print(f" [yellow]⚠[/yellow] {warning}")
            console.print()
        else:
            console.print("[green]✓ No warnings[/green]\n")

        # Info
        if validation_result.info:
            console.print(f"[cyan bold]Info ({len(validation_result.info)}):[/cyan bold]")
            for info in validation_result.info:
                console.print(f" [cyan]ℹ[/cyan] {info}")
            console.print()

        # Summary
        if quality_level == "INVALID":
            console.print("[red]✗ File does not meet Trailpack standards[/red]")
            raise typer.Exit(1)
        else:
            console.print("[green]✓ File meets Trailpack standards[/green]")

    except typer.Exit:
        # typer.Exit derives from Exception, so the generic handler below would
        # otherwise catch the deliberate exit above and print a stray "Error: 1".
        raise
    except Exception as e:
        console.print(f"[red]Error: {e}[/red]")
        raise typer.Exit(1)
@app.command()
def init(
    project_name: str = typer.Argument(..., help="Name of the project to initialize"),
    directory: Optional[Path] = typer.Option(None, "--dir", "-d", help="Directory to create project in (default: current)"),
):
    """
    Initialize a new Trailpack project structure.

    Creates a directory with example configs, data folder,
    and README with instructions.

    Example:
        trailpack init my-dataset
        trailpack init my-dataset --dir ~/projects
    """
    # NOTE: the docstring above doubles as the command's --help text.
    # Exits with code 1 when the target directory already exists or when any
    # filesystem operation fails.
    import json

    if directory is None:
        directory = Path.cwd()

    project_path = directory / project_name

    console.print(Panel.fit(
        f"[bold cyan]Creating project:[/bold cyan] {project_name}\n"
        f"[bold cyan]Location:[/bold cyan] {project_path}",
        title="Trailpack Init"
    ))

    try:
        # 1. Create directory structure
        if project_path.exists():
            console.print(f"[red]Error: Directory already exists: {project_path}[/red]")
            raise typer.Exit(1)

        console.print("[cyan]Creating directory structure...[/cyan]")
        project_path.mkdir(parents=True)
        (project_path / "data").mkdir()
        (project_path / "configs").mkdir()
        (project_path / "output").mkdir()

        # 2. Create README
        console.print("[cyan]Creating README.md...[/cyan]")
        # Doubled braces are literal braces in the f-string; "\\" emits one
        # backslash (shell line continuation) in the generated file.
        # NOTE(review): the documentation URL below spells the repository
        # "trailpaack" - confirm whether that is the intended address.
        readme_content = f"""# {project_name}

A Trailpack dataset standardization project.

## Directory Structure

- `data/` - Place your raw data files here (Excel, CSV)
- `configs/` - Store your mapping and metadata configuration files
- `output/` - Generated Parquet data packages will be saved here

## Quick Start

### 1. Add Your Data

Place your data file in the `data/` directory:

```bash
cp your-data.xlsx data/
```

### 2. Create Configs with UI

Launch the interactive UI to map your data:

```bash
trailpack ui
```

- Upload your data file
- Map columns to ontology terms
- Add metadata
- Download configuration files to `configs/`

### 3. Process Data

Use the CLI to process your data with the configs:

```bash
trailpack process \\
  --data data/your-data.xlsx \\
  --sheet "Sheet1" \\
  --mapping configs/mapping_config.json \\
  --metadata configs/metadata_config.json \\
  --output output/clean-data.parquet
```

### 4. Validate (Optional)

Validate your data without creating output:

```bash
trailpack validate \\
  --data data/your-data.xlsx \\
  --mapping configs/mapping_config.json \\
  --metadata configs/metadata_config.json
```

### 5. Check Output

Check an existing Parquet file:

```bash
trailpack check output/clean-data.parquet
```

## Configuration Files

### Mapping Config (`mapping_config.json`)

Maps your columns to ontology terms:

```json
{{
  "version": "1.0.0",
  "config_type": "mapping",
  "language": "en",
  "column_mappings": {{
    "Product": "https://vocab.sentier.dev/products/product/Product",
    "CO2_emissions": "https://vocab.sentier.dev/model-terms/generic-terms/Emission"
  }}
}}
```

### Metadata Config (`metadata_config.json`)

Defines package metadata:

```json
{{
  "version": "1.0.0",
  "config_type": "metadata",
  "package": {{
    "name": "{project_name}",
    "title": "Your Dataset Title",
    "description": "Description of your dataset"
  }}
}}
```

## Learn More

- [Trailpack Documentation](https://github.com/TimoDiepers/trailpaack)
- [Frictionless Data Standard](https://specs.frictionlessdata.io/)
"""

        # Explicit UTF-8: project_name is user input and may be non-ASCII, and
        # the platform default encoding is not UTF-8 everywhere.
        readme_path = project_path / "README.md"
        readme_path.write_text(readme_content, encoding="utf-8")

        # 3. Create example mapping config
        console.print("[cyan]Creating example configs...[/cyan]")
        example_mapping = {
            "version": "1.0.0",
            "config_type": "mapping",
            "language": "en",
            "file_info": {
                "original_file": "your-data.xlsx",
                "sheet_name": "Sheet1"
            },
            "column_mappings": {
                "product_name": "https://vocab.sentier.dev/products/product/Product",
                "emissions": "https://vocab.sentier.dev/model-terms/generic-terms/Emission"
            }
        }

        # ensure_ascii=False emits raw non-ASCII characters, so the file must
        # be written as UTF-8 to remain valid JSON on every platform.
        mapping_path = project_path / "configs" / "example_mapping_config.json"
        mapping_path.write_text(
            json.dumps(example_mapping, indent=2, ensure_ascii=False),
            encoding="utf-8",
        )

        # 4. Create example metadata config
        example_metadata = {
            "version": "1.0.0",
            "config_type": "metadata",
            "package": {
                "name": project_name,
                "title": f"{project_name.replace('-', ' ').title()} Dataset",
                "description": "Add your dataset description here",
                "version": "0.1.0",
                "keywords": ["lca", "sustainability"],
                "homepage": "https://example.com"
            },
            "licenses": [
                {
                    "name": "CC-BY-4.0",
                    "path": "https://creativecommons.org/licenses/by/4.0/",
                    "title": "Creative Commons Attribution 4.0"
                }
            ],
            "contributors": [
                {
                    "title": "Your Name",
                    "role": "author"
                }
            ]
        }

        metadata_path = project_path / "configs" / "example_metadata_config.json"
        metadata_path.write_text(
            json.dumps(example_metadata, indent=2, ensure_ascii=False),
            encoding="utf-8",
        )

        # 5. Create .gitignore
        gitignore_content = """# Data files
data/*.xlsx
data/*.xls
data/*.csv

# Output files
output/*.parquet

# Python
__pycache__/
*.py[cod]
*$py.class
.venv/
venv/

# IDE
.vscode/
.idea/
*.swp
"""

        gitignore_path = project_path / ".gitignore"
        gitignore_path.write_text(gitignore_content, encoding="utf-8")

        # Success message
        console.print("\n[green]✓ Project created successfully![/green]")
        console.print("\n[cyan]Next steps:[/cyan]")
        console.print(f" 1. cd {project_path}")
        console.print(" 2. Place your data in data/")
        console.print(" 3. Run: trailpack ui")
        console.print("\n[cyan]Files created:[/cyan]")
        console.print(" ✓ README.md")
        console.print(" ✓ configs/example_mapping_config.json")
        console.print(" ✓ configs/example_metadata_config.json")
        console.print(" ✓ .gitignore")
        console.print(" ✓ data/ (empty)")
        console.print(" ✓ output/ (empty)")

    except typer.Exit:
        # typer.Exit derives from Exception, so the generic handler below would
        # otherwise catch the deliberate exit above and print a stray "Error: 1".
        raise
    except Exception as e:
        console.print(f"[red]Error: {e}[/red]")
        raise typer.Exit(1)
def version_callback(value: bool):
    """Show the installed Trailpack version and terminate when *value* is truthy.

    Does nothing (returns ``None``) for a falsy *value*; otherwise prints the
    version through the shared console and raises ``typer.Exit``.
    """
    if not value:
        return

    # Imported only when needed, so a falsy value never touches the package.
    from trailpack import __version__

    console.print(f"[cyan]Trailpack version:[/cyan] {__version__}")
    raise typer.Exit()
@app.callback()
def main(
    version: bool = typer.Option(
        None,
        "--version",
        "-v",
        callback=version_callback,
        is_eager=True,
        help="Show version and exit",
    ),
):
    """
    Trailpack - Dataset standardization tool.

    Interactive UI for mapping data to ontologies and creating
    Frictionless Data Packages for LCA and sustainability research.
    """
    # Deliberately has no body: --version is handled by version_callback,
    # which is attached to the option above as an eager callback.
# Run the Typer application when this module is executed as a script.
if __name__ == "__main__":
    app()