# Source code for trailpack.validation

"""
Validation module for Trailpack.

Provides standards-based validation for data packages to ensure
quality and compliance before submission to repositories.

The validation system checks:
- Metadata completeness: All required fields present (name, title, resources, etc.)
- Data quality metrics: Missing values and duplicates (logged as info)
- Type consistency: Mixed types and schema matching (raises errors)
- Field definitions: Proper types, units for numeric fields

Key Components:
- StandardValidator: Main validation class for all checks
- ValidationResult: Result object with errors, warnings, info, and compliance level
- Standards YAML: Versioned validation rules in standards/v*.yaml

Data Quality vs Type Consistency:
- Data quality metrics (nulls, duplicates) are logged as INFO messages
- Type consistency issues (mixed types, schema mismatches) raise ERRORS
- Only errors cause validation to fail

Unit Requirements:
All numeric fields (type "number" or "integer") must have units specified,
even for dimensionless quantities like IDs and counts. Use the QUDT vocabulary
for unit definitions (http://qudt.org/vocab/unit/).

Inconsistency Tracking and Export:
When type inconsistencies are detected (e.g., mixed types in a column), each
inconsistent value is automatically tracked and exported to 'data_inconsistencies.csv'
when the ValidationResult is printed. This CSV file contains:
- row: Row index of the inconsistent value
- column: Column name
- value: The actual value
- actual_type: Python type of the value
- expected_type: Most common type in the column

This export happens automatically for easy data cleaning workflows.

Example:
    >>> from trailpack.validation import StandardValidator
    >>> validator = StandardValidator("1.0.0")
    >>> result = validator.validate_data_quality(df, schema)
    >>> print(result)  # Automatically exports data_inconsistencies.csv if issues found
    >>> if result.is_valid:
    ...     print(f"{result.level}")
    ... else:
    ...     for error in result.errors:
    ...         print(f"Error: {error}")
"""

from pathlib import Path

# Version of the validation module.
__version__ = "1.0.0"

# Directory holding the versioned standard definitions (v*.yaml); it sits
# next to this module.
STANDARDS_DIR = Path(__file__).parent.joinpath("standards")
def get_standard_path(version: str = "1.0.0") -> Path:
    """
    Resolve the YAML file that defines a given standard version.

    Args:
        version: Version string without the "v" prefix (default: "1.0.0").

    Returns:
        Path to ``v<version>.yaml`` inside the standards directory.

    Raises:
        FileNotFoundError: If no file exists for the requested version; the
            message lists the versions that are available.
    """
    candidate = STANDARDS_DIR.joinpath(f"v{version}.yaml")
    if candidate.exists():
        return candidate

    # Only look up the alternatives once we know the request failed.
    available = list_available_standards()
    raise FileNotFoundError(
        f"Standard version v{version} not found. "
        f"Available versions: {available}"
    )
def list_available_standards(standards_dir: "Path | None" = None) -> list[str]:
    """
    List all available standard versions, lowest version first.

    Versions are ordered numerically component by component, so "10.0.0"
    sorts after "2.0.0" (a plain string sort would place it first).

    Args:
        standards_dir: Directory to scan for ``v*.yaml`` files. When omitted,
            the package's ``STANDARDS_DIR`` is used.

    Returns:
        List of version strings (e.g., ["1.0.0"]); empty if the directory
        does not exist.
    """
    root = STANDARDS_DIR if standards_dir is None else standards_dir
    if not root.exists():
        return []

    def _version_key(version: str) -> tuple:
        # Numeric components compare as integers; anything else (e.g. a
        # "0-beta" suffix) compares as text and sorts after the numbers, so
        # mixed ints and strings never raise a TypeError.
        parts = tuple(
            (0, int(part), "") if part.isascii() and part.isdigit() else (1, 0, part)
            for part in version.split(".")
        )
        # The raw string breaks ties (e.g. "1.0" vs "01.0") so the order
        # does not depend on directory listing order.
        return parts, version

    # "v1.0.0.yaml" -> stem "v1.0.0" -> "1.0.0" (drop the leading "v").
    versions = [yaml_file.stem[1:] for yaml_file in root.glob("v*.yaml")]
    return sorted(versions, key=_version_key)
# Import validator classes. If their dependencies are not installed, the
# package still exposes the utility functions defined above.
try:
    from trailpack.validation.standard_validator import (
        StandardValidator,
        ValidationResult,
    )
except ImportError:
    # Dependencies not installed: export only the utility functions.
    __all__ = [
        "get_standard_path",
        "list_available_standards",
        "STANDARDS_DIR",
    ]
else:
    __all__ = [
        "get_standard_path",
        "list_available_standards",
        "StandardValidator",
        "ValidationResult",
        "STANDARDS_DIR",
    ]