# Release workflow: on a version tag, build the package once, publish it to
# PyPI via trusted publishing, then create a GitHub release that carries the
# very same build artifacts.
name: Release

on:
  push:
    tags:
      - 'v*'

jobs:
  publish:
    runs-on: ubuntu-latest
    permissions:
      id-token: write  # Mandatory for trusted publishing
      contents: read

    steps:
      - uses: actions/checkout@v4

      - name: Install uv
        uses: astral-sh/setup-uv@v4

      - name: Set up Python
        run: uv python install 3.12

      - name: Build package
        run: uv build

      # Each job runs on its own runner with a fresh workspace, so the built
      # distributions must be handed to the github-release job explicitly.
      - name: Upload build artifacts
        uses: actions/upload-artifact@v4
        with:
          name: dist
          path: dist/
          if-no-files-found: error

      - name: Publish to PyPI
        uses: pypa/gh-action-pypi-publish@release/v1
        # with:
        #   repository-url: https://upload.pypi.org/legacy/  # Optional: defaults to PyPI

  github-release:
    runs-on: ubuntu-latest
    needs: publish
    permissions:
      contents: write
    steps:
      - uses: actions/checkout@v4

      # Restore the distributions built by the publish job; without this step
      # dist/ does not exist here and the release would have no assets.
      - name: Download build artifacts
        uses: actions/download-artifact@v4
        with:
          name: dist
          path: dist/

      - name: Create GitHub Release
        uses: softprops/action-gh-release@v2
        with:
          generate_release_notes: true
          files: dist/*
          # Fail loudly instead of silently publishing a release without assets.
          fail_on_unmatched_files: true
changes to this project will be documented in this file. + +The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), +and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). + +## [0.1.0] - 2026-05-22 + +### Added +- First official stable release. +- Comprehensive dataset profiling and health checks. +- Support for multiple report formats: Markdown, JSON, HTML, and PDF. +- Two HTML report themes: `minimal` and `neubrutalism`. +- Automatic suggestion provider for data quality issues. +- Code generation for pandas fix scripts. +- Sklearn preprocessing pipeline generation. +- CLI commands: `scan`, `details`, `report`, `checks`, and `version`. +- Robust error handling for invalid files, empty datasets, and malformed configurations. +- Configuration support via YAML, TOML, and JSON. +- Intelligent sampling for large datasets. +- Dataset drift detection. +- Mutual information and statistical checks. + +### Changed +- Refactored report generators for robust lazy loading. +- Improved CLI output with better error messages and color-coded statuses. +- Updated documentation and website for stable release. + +### Fixed +- Fixed crash when generating reports for single-row datasets. +- Fixed dependency issues in report generation when optional libraries are missing. +- Fixed non-deterministic order in generated fix scripts. + +## [0.1.0b3] - 2026-04-15 +- Initial beta release with core features. diff --git a/README.md b/README.md index 608d81c..3e378a7 100644 --- a/README.md +++ b/README.md @@ -27,7 +27,7 @@ > [!NOTE] -> HashPrep is in **beta** (v0.1.0b3). Core features are fully tested with CI. The API may still evolve based on community feedback. +> HashPrep is now in its first stable release (v0.1.0). Core features are fully tested with CI. 
## Overview @@ -130,6 +130,7 @@ hashprep report dataset.csv --format html --theme minimal ``` **Options:** +- `--output PATH`, `-o PATH`: Custom output file path - `--format {md,json,html,pdf}`: Report format (default: md) - `--theme {minimal,neubrutalism}`: HTML report theme (default: minimal) - `--with-code`: Generate Python scripts for fixes and pipelines @@ -153,6 +154,9 @@ hashprep report dataset.csv --format pdf --no-visualizations # Generate report with automatic fix scripts hashprep report dataset.csv --with-code +# Generate report with custom output path +hashprep report dataset.csv --format html --output my_reports/analysis.html + # This creates: # - dataset_hashprep_report.md (or .html/.pdf/.json) # - dataset_hashprep_report_fixes.py (pandas script) @@ -162,7 +166,13 @@ hashprep report dataset.csv --with-code hashprep report train.csv --comparison test.csv --format html ``` -#### 4. Version +#### 4. List Available Checks +Discover all data quality checks that HashPrep can perform. +```bash +hashprep checks +``` + +#### 5. Version Check HashPrep version. ```bash hashprep version diff --git a/RELEASE_TODO.md b/RELEASE_TODO.md new file mode 100644 index 0000000..74ee292 --- /dev/null +++ b/RELEASE_TODO.md @@ -0,0 +1,181 @@ +# HashPrep Official Release TODO + +## Release Scope + +- [x] Decide the official version target, likely `0.1.0` unless the release should signal `1.0.0`. +- [ ] Freeze public API expectations for `DatasetAnalyzer`. +- [ ] Freeze public API expectations for `HashPrepConfig`. +- [ ] Freeze public API expectations for `load_config`. +- [ ] Freeze public API expectations for `generate_report`. +- [x] Freeze CLI behavior for `scan`, `details`, `report`, and `version`. +- [ ] Define what is stable versus experimental, especially auto-fixes, generated pipelines, report themes, and statistical checks. + +## Code Readiness + +- [ ] Run the full test suite across Python `3.10`, `3.11`, and `3.12`. +- [ ] Run lint checks. +- [ ] Run format checks. 
+- [x] Add or verify tests for CLI error handling. +- [ ] Add or verify tests for invalid check names. +- [x] Add or verify tests for report generation in `md`, `json`, `html`, and `pdf`. +- [x] Add or verify tests for generated `fixes.py` code. +- [x] Add or verify tests for generated sklearn pipeline code. +- [x] Add or verify tests for config loading from YAML, TOML, and JSON. +- [ ] Add or verify tests for large dataset sampling behavior. +- [ ] Verify empty CSV behavior. +- [ ] Verify duplicate column behavior. +- [ ] Verify missing target column behavior. +- [ ] Verify non-numeric target behavior for mutual information and statistical checks. +- [ ] Verify datasets with infinities, all-null columns, and mixed types. +- [ ] Verify reports do not crash when plots are disabled. +- [ ] Verify reports do not crash when optional summary data is missing. + +## Functionality Readiness + +### Must Improve Before Stable + +- [x] Improve CLI error handling for invalid files. +- [x] Improve CLI error handling for empty CSVs. +- [x] Improve CLI error handling for bad target columns. +- [x] Improve CLI error handling for bad config files. +- [x] Improve CLI error handling for unsupported report formats. +- [x] Improve CLI error handling for failed PDF generation. +- [x] Ensure CLI failures produce clear user-facing messages. +- [ ] Ensure HTML reports work when no issues are found. +- [ ] Ensure PDF reports work when no issues are found. +- [ ] Ensure Markdown reports work when no issues are found. +- [ ] Ensure JSON reports work when no issues are found. +- [ ] Ensure all report formats work when plots are disabled. +- [ ] Ensure all report formats work for tiny datasets. +- [ ] Ensure all report formats work for mostly missing datasets. +- [ ] Ensure all report formats work when optional summaries are absent. +- [x] Verify generated fix scripts are deterministic. +- [x] Verify generated fix scripts are valid Python. 
+- [x] Verify generated sklearn pipeline code is deterministic. +- [x] Verify generated sklearn pipeline code is valid Python. +- [x] Add tests that execute generated fix scripts where practical. +- [x] Add tests that execute generated sklearn pipeline code where practical. +- [ ] Clearly label generated fixes as suggestions if they are heuristic or incomplete. +- [x] Add config validation for unknown keys. +- [x] Add config validation for wrong value types. +- [x] Add clear errors for malformed YAML, TOML, and JSON config files. +- [ ] Confirm threshold behavior is predictable and documented. +- [ ] Decide whether summary dictionary shapes are part of the stable public API. +- [ ] Document stable summary keys if summary dictionaries are part of the public API. + +### Strongly Recommended + +- [x] Add an `--output` option to `hashprep report`. +- [x] Allow `hashprep report data.csv --format html --output reports/data.html`. +- [ ] Add machine-readable JSON output for `hashprep details`. +- [x] Add check discovery through a command such as `hashprep checks`. +- [x] Alternatively add check discovery through an option such as `hashprep scan --list-checks`. +- [ ] Document why issues are classified as `critical` versus `warning`. +- [ ] Review whether PDF/reporting dependencies should move to optional extras in a future release. +- [ ] Document any dependency-extra plan if it is deferred. + +### Not For This Stable Release + +- [ ] Avoid adding major new check families before the first stable release unless they fix a release blocker. +- [ ] Avoid adding model integrations before the first stable release. +- [ ] Avoid adding dashboard features before the first stable release. +- [ ] Avoid expanding automatic dataset repair workflows before the first stable release. +- [ ] Keep the first stable release focused on hardening existing behavior. + +## Packaging + +- [x] Update `hashprep/__init__.py` from `0.1.0b3` to the official release version. 
+- [x] Update the beta note in `README.md`. +- [x] Update the beta support table in `SECURITY.md`. +- [x] Add or verify PyPI classifiers in `pyproject.toml`. +- [x] Add or verify package keywords in `pyproject.toml`. +- [x] Add or verify project URLs in `pyproject.toml`. +- [x] Add or verify Python version classifiers in `pyproject.toml`. +- [x] Add or verify license metadata in `pyproject.toml`. +- [ ] Build source distribution. +- [ ] Build wheel distribution. +- [ ] Inspect built package artifacts. +- [ ] Install the built wheel in a clean environment. +- [ ] Smoke test `import hashprep` from the built wheel. +- [ ] Smoke test `hashprep version` from the built wheel. +- [ ] Smoke test `hashprep scan datasets/train.csv` from the built wheel. +- [ ] Smoke test `hashprep report datasets/train.csv --format html` from the built wheel. +- [ ] Confirm `MANIFEST.in` excludes dev/demo files intentionally. +- [ ] Confirm `MANIFEST.in` includes all runtime files needed by reports and templates. + +## Documentation + +- [x] Replace beta references in `README.md`. +- [x] Replace beta references in `SECURITY.md`. +- [x] Replace beta references in `web/src/lib/components/Hero.svelte`. +- [x] Add `CHANGELOG.md`. +- [ ] Add first official release notes. +- [ ] Document available checks in release notes. +- [ ] Document CLI commands in release notes. +- [ ] Document report formats in release notes. +- [ ] Document known limitations in release notes. +- [ ] Document upgrade notes from beta. +- [ ] Verify README examples run exactly as written. +- [ ] Update the documentation URL in `pyproject.toml` if a dedicated docs page is available. +- [ ] Refresh generated example reports under `examples/reports/` if needed. + +## CI/CD + +- [x] Add package build validation to CI. +- [x] Add `twine check` or equivalent artifact validation to CI. +- [ ] Add built-wheel install smoke test to CI. +- [ ] Add CLI smoke test against the built wheel to CI. +- [x] Add website build check for `web/`. 
+- [x] Add or verify release workflow triggered by version tags. +- [ ] Configure PyPI publishing, preferably with trusted publishing. +- [x] Configure GitHub release creation. +- [ ] Consider adding dependency and security scanning for Python dependencies. +- [ ] Consider adding dependency and security scanning for web dependencies. + +## Security And Dependencies + +- [ ] Review pinned and minimum Python dependency versions. +- [ ] Review pinned and minimum web dependency versions. +- [ ] Confirm heavy dependencies are intentional, especially `weasyprint`, `matplotlib`, `seaborn`, and `scikit-learn`. +- [ ] Decide whether PDF/report dependencies should remain core dependencies or move to optional extras in a future release. +- [ ] Run vulnerability checks for Python dependencies. +- [ ] Run vulnerability checks for web dependencies. +- [x] Update `SECURITY.md` to describe stable release support. + +## Website + +- [x] Update website beta badges. +- [ ] Update website install examples. +- [ ] Confirm the docs page matches the current README and API. +- [x] Build the static site successfully. +- [ ] Verify PyPI link. +- [ ] Verify GitHub link. +- [ ] Verify docs link. +- [ ] Verify license link. +- [ ] Verify issue tracker link. +- [ ] Decide deployment target for the docs site. +- [ ] Decide release timing for the docs site. + +## Release Process + +- [ ] Create a release branch. +- [ ] Make version, docs, and changelog updates. +- [ ] Run full validation locally. +- [ ] Merge after CI passes. +- [ ] Tag the release, for example `v0.1.0`. +- [ ] Publish to PyPI. +- [ ] Create GitHub release with release notes. +- [ ] Verify public install with `pip install hashprep`. +- [ ] Verify public CLI with `hashprep version`. +- [ ] Announce the release as the first stable release after alpha and beta. + +## Known Immediate Gaps + +- [x] Version is still `0.1.0b3`. +- [x] `README.md` still says beta. +- [x] Website hero still says beta and shows `hashprep-0.1.0b3`. 
+- [x] `SECURITY.md` only describes beta support. +- [x] `CHANGELOG.md` is not present. +- [x] CI does not currently build or check publish artifacts. +- [x] CI does not currently build the Svelte website. +- [x] No release or publish workflow is present. diff --git a/SECURITY.md b/SECURITY.md index 16efa74..4255a0d 100644 --- a/SECURITY.md +++ b/SECURITY.md @@ -2,12 +2,12 @@ ## Supported Versions -hashprep is currently in beta (`0.1.0bX`). Only the latest beta release on the `main` branch receives security updates. Older pre-releases are not patched — please upgrade to the newest version to pick up fixes. +hashprep has reached a stable `0.1.0` release. Only the latest minor release is supported for security updates. | Version | Supported | | ---------- | ------------------ | -| `0.1.0b3` | :white_check_mark: | -| `< 0.1.0b3`| :x: | +| `0.1.0` | :white_check_mark: | +| `< 0.1.0` | :x: | -Once hashprep reaches a stable `0.1.0` release, this table will be updated to reflect supported minor versions. +This table will be updated as new minor versions are released. diff --git a/hashprep/__init__.py b/hashprep/__init__.py index de55b79..980f0ce 100644 --- a/hashprep/__init__.py +++ b/hashprep/__init__.py @@ -2,4 +2,4 @@ from .core.analyzer import DatasetAnalyzer as DatasetAnalyzer from .utils.config_loader import load_config as load_config -__version__ = "0.1.0b3" +__version__ = "0.1.0" diff --git a/hashprep/config.py b/hashprep/config.py index 3bd4c84..738bc81 100644 --- a/hashprep/config.py +++ b/hashprep/config.py @@ -236,21 +236,44 @@ class HashPrepConfig: def config_from_dict(d: dict) -> "HashPrepConfig": """Build a HashPrepConfig from a (possibly partial) nested dict. - Unknown keys are silently ignored; missing keys fall back to defaults. + Missing keys fall back to defaults. Raises ValueError for unknown keys and TypeError for wrong value types. 
def _merge(cls, default_obj, overrides: dict, path=""):
    """Build an instance of dataclass ``cls`` from ``overrides`` layered over defaults.

    Args:
        cls: Dataclass type to instantiate.
        default_obj: Instance of ``cls`` that supplies the default value (and
            therefore the expected value type) of every field.
        overrides: Possibly partial mapping of field name to user-supplied value.
        path: Dotted path of ``overrides`` inside the whole configuration; used
            only to build error messages.

    Returns:
        A new ``cls`` instance in which every field comes from ``overrides``
        when present there and from ``default_obj`` otherwise.

    Raises:
        ValueError: If ``overrides`` contains a key that is not a field of ``cls``.
        TypeError: If ``overrides`` (or a nested section) is not a mapping, or a
            value's type does not match the type of the field's default.
    """
    # An empty config file typically parses to None; report that clearly
    # instead of failing later with "'NoneType' object is not iterable".
    if not isinstance(overrides, dict):
        where = f"key '{path}'" if path else "root"
        raise TypeError(f"Configuration {where} must be a mapping, got {type(overrides).__name__}")

    field_names = [f.name for f in _fields(cls)]

    # Reject unknown keys up front so typos do not silently fall back to defaults.
    allowed_keys = set(field_names)
    for key in overrides:
        if key not in allowed_keys:
            full_path = f"{path}.{key}" if path else key
            raise ValueError(f"Unknown configuration key: {full_path}")

    kwargs = {}
    for name in field_names:
        field_default = getattr(default_obj, name)
        if name not in overrides:
            kwargs[name] = field_default
            continue

        val = overrides[name]
        full_path = f"{path}.{name}" if path else name

        # Nested dataclass section: recurse with the nested default as the template.
        if hasattr(field_default, "__dataclass_fields__"):
            if not isinstance(val, dict):
                raise TypeError(f"Configuration key '{full_path}' must be a mapping, got {type(val).__name__}")
            kwargs[name] = _merge(type(field_default), field_default, val, full_path)
            continue

        # Basic type validation against the type of the default value. A default
        # of None carries no type information, so such fields are not checked.
        if field_default is not None:
            expected_type = type(field_default)
            # bool is a subclass of int, so a plain isinstance() check would let
            # True/False through for int fields and coerce them for float fields.
            bool_for_non_bool = isinstance(val, bool) and expected_type is not bool
            if expected_type is float and isinstance(val, int) and not bool_for_non_bool:
                # Accept an integer literal (e.g. ``1``) for a float field.
                val = float(val)
            elif bool_for_non_bool or not isinstance(val, expected_type):
                raise TypeError(
                    f"Configuration key '{full_path}' expected type {expected_type.__name__}, got {type(val).__name__}"
                )
        kwargs[name] = val

    return cls(**kwargs)
def scan(file_path, critical_only, quiet, json_out, target, checks, comparison, sample_size, no_sample, config_path):
    """Run a quick health check on a CSV dataset and print a summary of the issues found."""
    with error_handler():
        df = pd.read_csv(file_path)
        comparison_df = pd.read_csv(comparison) if comparison else None

        selected_checks = checks.split(",") if checks else None
        valid_checks = DatasetAnalyzer.ALL_CHECKS
        if selected_checks:
            invalid_checks = [c for c in selected_checks if c not in valid_checks]
            if invalid_checks:
                # With the JSON flag set, stdout must contain nothing but the JSON
                # document, otherwise consumers cannot parse it; send the
                # warning to stderr in that mode. Human-readable modes are
                # unchanged and keep printing it to stdout.
                click.echo(f"Warning: Invalid checks ignored: {', '.join(invalid_checks)}", err=json_out)
                for invalid in invalid_checks:
                    suggestions = suggest_check_names(invalid, valid_checks)
                    if suggestions:
                        click.echo(f" Did you mean: {', '.join(suggestions)}?", err=json_out)
            selected_checks = [c for c in selected_checks if c in valid_checks]

        # An explicit sample size only applies when sampling is not disabled.
        sampling_config = None
        if not no_sample and sample_size:
            sampling_config = SamplingConfig(max_rows=sample_size)

        config = load_config(config_path) if config_path else None
        analyzer = DatasetAnalyzer(
            df,
            target_col=target,
            selected_checks=selected_checks,
            comparison_df=comparison_df,
            sampling_config=sampling_config,
            auto_sample=not no_sample,
            config=config,
        )
        summary = analyzer.analyze()

        issues = summary["issues"]
        critical = [i for i in issues if i["severity"] == "critical"]
        warnings = [i for i in issues if i["severity"] == "warning"]

        # Machine-readable output: one JSON document on stdout, nothing else.
        if json_out:
            json_data = {
                "critical_issues": len(critical),
                "warnings": len(warnings),
                "issues": [{"type": i["severity"], **i} for i in issues],
                "recommendations": [i["quick_fix"] for i in issues],
            }
            if "sampling_info" in summary:
                json_data["sampling_info"] = summary["sampling_info"]
            click.echo(json.dumps(json_data, default=json_numpy_handler))
            return

        # Quiet mode: counts only.
        if quiet:
            click.echo(f"CRITICAL ISSUES: {len(critical)}, WARNINGS: {len(warnings)}")
            return

        dataset_info = summary["summaries"]["dataset_info"]
        click.echo(f"Dataset Health Check: {file_path}")
        click.echo(f"Size: {dataset_info['rows']} rows x {dataset_info['columns']} columns")

        if "sampling_info" in summary and summary["sampling_info"].get("was_sampled"):
            info = summary["sampling_info"]
            click.echo(f"Sampled: {info['sample_fraction'] * 100:.1f}% of {info['original_rows']} rows")

        if critical_only:
            click.echo("Critical Issues:")
            for i, issue in enumerate(critical, 1):
                click.echo(f"{i}. {issue['description']}")
            return

        click.echo("Critical Issues:")
        for issue in critical:
            click.echo(f"- {issue['description']}")
        click.echo("Warnings:")
        for issue in warnings:
            click.echo(f"- {issue['description']}")
        click.echo("Next steps: Run 'hashprep details' or 'hashprep report' for more info.")
target_col=target, - selected_checks=selected_checks, - comparison_df=comparison_df, - sampling_config=sampling_config, - auto_sample=not no_sample, - config=config, - ) - summary = analyzer.analyze() - - issues = summary["issues"] - critical = [i for i in issues if i["severity"] == "critical"] - warnings = [i for i in issues if i["severity"] == "warning"] - - click.echo(f"Detailed Analysis: {file_path}") - - if "sampling_info" in summary and summary["sampling_info"].get("was_sampled"): - info = summary["sampling_info"] - click.echo( - f"Note: Analysis performed on {info['sample_fraction'] * 100:.1f}% sample ({int(info['original_rows'] * info['sample_fraction'])} of {info['original_rows']} rows)" + with error_handler(): + df = pd.read_csv(file_path) + comparison_df = pd.read_csv(comparison) if comparison else None + + selected_checks = checks.split(",") if checks else None + valid_checks = DatasetAnalyzer.ALL_CHECKS + if selected_checks: + invalid_checks = [c for c in selected_checks if c not in valid_checks] + if invalid_checks: + click.echo(f"Warning: Invalid checks ignored: {', '.join(invalid_checks)}") + for invalid in invalid_checks: + suggestions = suggest_check_names(invalid, valid_checks) + if suggestions: + click.echo(f" Did you mean: {', '.join(suggestions)}?") + selected_checks = [c for c in selected_checks if c in valid_checks] + + sampling_config = None + if not no_sample and sample_size: + sampling_config = SamplingConfig(max_rows=sample_size) + + config = load_config(config_path) if config_path else None + analyzer = DatasetAnalyzer( + df, + target_col=target, + selected_checks=selected_checks, + comparison_df=comparison_df, + sampling_config=sampling_config, + auto_sample=not no_sample, + config=config, ) + summary = analyzer.analyze() - click.echo("\nCritical Issues:") - for i, issue in enumerate(critical, 1): - click.echo(f"{i}. 
{issue['category'].upper()} - '{issue['column']}'") - click.echo(f" Description: {issue['description']}") - click.echo(f" Impact: {issue['impact_score'].capitalize()}") - click.echo(f" Quick fix: {issue['quick_fix']}") - - click.echo("\nWarnings:") - for i, issue in enumerate(warnings, 1): - click.echo(f"{i}. {issue['category'].upper()}") - click.echo(f" Description: {issue['description']}") - click.echo(f" Impact: {issue['impact_score'].capitalize()}") - click.echo(f" Quick fix: {issue['quick_fix']}") - - click.echo("\nDataset Summary:") - info = summary["summaries"]["dataset_info"] - click.echo(f"- Rows: {info['rows']}") - click.echo(f"- Columns: {info['columns']}") - click.echo(f"- Memory: ~{info['memory_mb']} MB") - click.echo(f"- Missing: {info['missing_cells']} ({info['missing_percentage']} %)") - click.echo("- Variable Types:") - for col, typ in summary["summaries"]["variable_types"].items(): - click.echo(f" {col}: {typ}") - click.echo("- Missing Values (by column):") - for col, pct in sorted( - summary["summaries"]["missing_values"]["percentage"].items(), - key=lambda x: x[1], - reverse=True, - ): - if pct > 0: - click.echo(f" {col}: {pct}%") - repro = summary["summaries"]["reproduction_info"] - click.echo(f"- Dataset Hash: {repro['dataset_hash']}") - if "analysis_started" in repro and repro["analysis_started"]: - click.echo(f"- Analysis Started: {repro['analysis_started'][:19]}") - if "duration_seconds" in repro: - click.echo(f"- Duration: {repro['duration_seconds']} seconds") + issues = summary["issues"] + critical = [i for i in issues if i["severity"] == "critical"] + warnings = [i for i in issues if i["severity"] == "warning"] + + click.echo(f"Detailed Analysis: {file_path}") + + if "sampling_info" in summary and summary["sampling_info"].get("was_sampled"): + info = summary["sampling_info"] + click.echo( + f"Note: Analysis performed on {info['sample_fraction'] * 100:.1f}% sample ({int(info['original_rows'] * info['sample_fraction'])} of 
{info['original_rows']} rows)" + ) + + click.echo("\nCritical Issues:") + for i, issue in enumerate(critical, 1): + click.echo(f"{i}. {issue['category'].upper()} - '{issue['column']}'") + click.echo(f" Description: {issue['description']}") + click.echo(f" Impact: {issue['impact_score'].capitalize()}") + click.echo(f" Quick fix: {issue['quick_fix']}") + + click.echo("\nWarnings:") + for i, issue in enumerate(warnings, 1): + click.echo(f"{i}. {issue['category'].upper()}") + click.echo(f" Description: {issue['description']}") + click.echo(f" Impact: {issue['impact_score'].capitalize()}") + click.echo(f" Quick fix: {issue['quick_fix']}") + + click.echo("\nDataset Summary:") + info = summary["summaries"]["dataset_info"] + click.echo(f"- Rows: {info['rows']}") + click.echo(f"- Columns: {info['columns']}") + click.echo(f"- Memory: ~{info['memory_mb']} MB") + click.echo(f"- Missing: {info['missing_cells']} ({info['missing_percentage']} %)") + click.echo("- Variable Types:") + for col, typ in summary["summaries"]["variable_types"].items(): + click.echo(f" {col}: {typ}") + click.echo("- Missing Values (by column):") + for col, pct in sorted( + summary["summaries"]["missing_values"]["percentage"].items(), + key=lambda x: x[1], + reverse=True, + ): + if pct > 0: + click.echo(f" {col}: {pct}%") + repro = summary["summaries"]["reproduction_info"] + click.echo(f"- Dataset Hash: {repro['dataset_hash']}") + if "analysis_started" in repro and repro["analysis_started"]: + click.echo(f"- Analysis Started: {repro['analysis_started'][:19]}") + if "duration_seconds" in repro: + click.echo(f"- Duration: {repro['duration_seconds']} seconds") @cli.command() @click.argument("file_path", type=click.Path(exists=True)) +@click.option("--output", "-o", default=None, help="Output file path for the report") @click.option("--with-code", is_flag=True, help="Generate fixes.py and pipeline.py scripts") @click.option("--full/--no-full", default=True, help="Include full summaries in report (default: 
# NOTE(review): the order of the parameters between `format` and `no_sample`
# is not visible in the reviewed hunk and was reconstructed from the names the
# body uses; click binds them by keyword — confirm against the file.
def report(
    file_path,
    output,
    with_code,
    full,
    format,
    theme,
    target,
    checks,
    comparison,
    visualizations,
    sample_size,
    no_sample,
    config_path,
):
    """Analyze a CSV dataset and write a report, optionally with generated fix scripts."""
    with error_handler():
        df = pd.read_csv(file_path)
        comparison_df = pd.read_csv(comparison) if comparison else None

        selected_checks = checks.split(",") if checks else None
        valid_checks = DatasetAnalyzer.ALL_CHECKS
        if selected_checks:
            invalid_checks = [c for c in selected_checks if c not in valid_checks]
            if invalid_checks:
                click.echo(f"Warning: Invalid checks ignored: {', '.join(invalid_checks)}")
                for invalid in invalid_checks:
                    suggestions = suggest_check_names(invalid, valid_checks)
                    if suggestions:
                        click.echo(f" Did you mean: {', '.join(suggestions)}?")
            selected_checks = [c for c in selected_checks if c in valid_checks]

        # An explicit sample size only applies when sampling is not disabled.
        sampling_config = None
        if not no_sample and sample_size:
            sampling_config = SamplingConfig(max_rows=sample_size)

        config = load_config(config_path) if config_path else None
        analyzer = DatasetAnalyzer(
            df,
            target_col=target,
            selected_checks=selected_checks,
            include_plots=visualizations,
            comparison_df=comparison_df,
            sampling_config=sampling_config,
            auto_sample=not no_sample,
            config=config,
        )
        summary = analyzer.analyze()

        # `base_name` is the stem shared by the report and the generated
        # scripts. It must be defined on both branches: with a custom output
        # path it was previously left unbound, so combining it with the
        # code-generation flag crashed when the script names were built.
        if output:
            report_file = output
            base_name = os.path.splitext(report_file)[0]
        else:
            # Save to current working directory by default
            base_name = os.path.splitext(os.path.basename(file_path))[0] + "_hashprep_report"
            report_file = f"{base_name}.{format}"

        # Allow output paths such as `my_reports/analysis.html` whose parent
        # directory does not exist yet.
        output_dir = os.path.dirname(report_file)
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)

        generate_report(
            summary,
            format=format,
            full=full,
            output_file=report_file,
            theme=theme,
        )
        click.echo(f"Report saved to: {report_file}")
        click.echo(f"Summary: {summary['critical_count']} critical, {summary['warning_count']} warnings")

        if "sampling_info" in summary and summary["sampling_info"].get("was_sampled"):
            info = summary["sampling_info"]
            click.echo(f"Note: Analysis performed on {info['sample_fraction'] * 100:.1f}% sample")

        if with_code:
            issues = [Issue(**i) for i in summary["issues"]]
            column_types = summary.get("column_types", {})

            provider = SuggestionProvider(
                issues=issues,
                column_types=column_types,
                target_col=target,
            )
            suggestions = provider.get_suggestions()

            # Generated code can embed arbitrary column names, so write it as
            # UTF-8 rather than relying on the platform default encoding.
            codegen = CodeGenerator(suggestions)
            fixes_file = f"{base_name}_fixes.py"
            fixes_code = codegen.generate_pandas_script()
            with open(fixes_file, "w", encoding="utf-8") as f:
                f.write(fixes_code)
            click.echo(f"Pandas fixes script saved to: {fixes_file}")

            builder = PipelineBuilder(suggestions)
            pipeline_file = f"{base_name}_pipeline.py"
            pipeline_code = builder.generate_pipeline_code()
            with open(pipeline_file, "w", encoding="utf-8") as f:
                f.write(pipeline_code)
            click.echo(f"sklearn pipeline script saved to: {pipeline_file}")
output.""" diff --git a/hashprep/reports/generators.py b/hashprep/reports/generators.py index a52f8af..d81e7d9 100644 --- a/hashprep/reports/generators.py +++ b/hashprep/reports/generators.py @@ -8,31 +8,44 @@ def generate(self, summary, full=False, output_file=None): # Lazy loading report classes -def _load_generators(): - from .html import HtmlReport - from .json import JsonReport - from .markdown import MarkdownReport - from .pdf import PdfReport +def _load_generator(format_name): + if format_name == "md": + from .markdown import MarkdownReport - return { - "md": MarkdownReport(), - "json": JsonReport(), - "html": HtmlReport(), - "pdf": PdfReport(), - } + return MarkdownReport() + elif format_name == "json": + from .json import JsonReport + + return JsonReport() + elif format_name == "html": + from .html import HtmlReport + + return HtmlReport() + elif format_name == "pdf": + try: + from .pdf import PdfReport + + return PdfReport() + except Exception as e: + # Re-raise as a cleaner error for the CLI to catch + raise ImportError(f"PDF generation is unavailable because of a missing dependency: {e}") from e + return None # get generators dictionary def get_generators(): if not hasattr(get_generators, "cache"): - get_generators.cache = _load_generators() + get_generators.cache = {} return get_generators.cache def generate_report(summary, format="md", full=False, output_file=None, theme="minimal"): generators = get_generators() if format not in generators: - raise ValueError(f"Unsupported format: {format}") + gen = _load_generator(format) + if gen is None: + raise ValueError(f"Unsupported format: {format}") + generators[format] = gen if format in ["html", "pdf"]: return generators[format].generate(summary, full, output_file, theme=theme) diff --git a/hashprep/utils/config_loader.py b/hashprep/utils/config_loader.py index 53b56ac..9c19f6c 100644 --- a/hashprep/utils/config_loader.py +++ b/hashprep/utils/config_loader.py @@ -24,8 +24,11 @@ def load_config(path: str | 
Path) -> HashPrepConfig: import yaml except ImportError as e: raise ImportError("pyyaml is required for YAML config files: pip install pyyaml") from e - with open(path) as f: - raw = yaml.safe_load(f) or {} + try: + with open(path) as f: + raw = yaml.safe_load(f) or {} + except yaml.YAMLError as e: + raise ValueError(f"Malformed YAML config file: {path}\n{e}") from e elif suffix == ".toml": try: import tomllib @@ -36,11 +39,17 @@ def load_config(path: str | Path) -> HashPrepConfig: raise ImportError( "tomllib (Python 3.11+) or tomli is required for TOML config files: pip install tomli" ) from e - with open(path, "rb") as f: - raw = tomllib.load(f) + try: + with open(path, "rb") as f: + raw = tomllib.load(f) + except Exception as e: + raise ValueError(f"Malformed TOML config file: {path}\n{e}") from e elif suffix == ".json": - with open(path) as f: - raw = json.load(f) + try: + with open(path) as f: + raw = json.load(f) + except json.JSONDecodeError as e: + raise ValueError(f"Malformed JSON config file: {path}\n{e}") from e else: raise ValueError(f"Unsupported config file format: {suffix!r}. 
Use .yaml, .yml, .toml, or .json") diff --git a/pyproject.toml b/pyproject.toml index 62f5d2c..5d67f4a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -8,6 +8,18 @@ description = "A library for dataset quality checks, preprocessing, and report g readme = "README.md" requires-python = ">=3.10" license = { file = "LICENSE" } +keywords = ["data-science", "machine-learning", "eda", "data-cleaning", "profiling"] +classifiers = [ + "Development Status :: 5 - Production/Stable", + "Intended Audience :: Science/Research", + "Intended Audience :: Developers", + "License :: OSI Approved :: MIT License", + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", + "Topic :: Scientific/Engineering :: Information Analysis", +] authors = [ { name = "Aftaab Siddiqui (MaskedSyntax)", email = "aftaab@aftaab.xyz"} diff --git a/tests/test_cli.py b/tests/test_cli.py index b2b1e91..cb2794f 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -11,6 +11,20 @@ import pytest +def has_pdf_deps(): + try: + # We need to check if we can actually run a PDF report + from hashprep.reports.pdf import PdfReport + + PdfReport() + return True + except Exception: + return False + + +skip_pdf = pytest.mark.skipif(not has_pdf_deps(), reason="PDF dependencies missing") + + @pytest.fixture def titanic_csv(): """Path to titanic dataset.""" @@ -172,6 +186,7 @@ def test_report_html_neubrutalism(self, titanic_csv, temp_output_dir): assert result.returncode == 0 assert "train_hashprep_report.html" in result.stdout + @skip_pdf def test_report_pdf(self, titanic_csv, temp_output_dir): """Test PDF report generation.""" result = run_cli(["report", titanic_csv, "--format", "pdf", "--full"], cwd=temp_output_dir) @@ -247,6 +262,30 @@ def test_invalid_file(self): assert result.returncode != 0 + def test_empty_csv(self, temp_output_dir): + """Test with empty CSV.""" + empty_file = 
os.path.join(temp_output_dir, "empty.csv") + with open(empty_file, "w"): + pass + result = run_cli(["scan", empty_file]) + assert result.returncode != 0 + assert "empty" in result.stderr.lower() + + def test_bad_target(self, titanic_csv): + """Test with non-existent target column.""" + result = run_cli(["scan", titanic_csv, "--target", "nonexistent"]) + assert result.returncode != 0 + assert "not found" in result.stderr.lower() + + def test_malformed_config(self, titanic_csv, temp_output_dir): + """Test with malformed YAML config.""" + bad_config = os.path.join(temp_output_dir, "bad.yaml") + with open(bad_config, "w") as f: + f.write("invalid: yaml: [") + result = run_cli(["scan", titanic_csv, "--config", bad_config]) + assert result.returncode != 0 + assert "malformed" in result.stderr.lower() + def test_invalid_format(self, titanic_csv): """Test with invalid report format.""" result = run_cli(["report", titanic_csv, "--format", "invalid"]) @@ -260,8 +299,6 @@ def test_invalid_check_name(self, titanic_csv): assert result.returncode == 0 assert "Warning: Invalid checks ignored" in result.stdout - # Fuzzy suggestion feature (if merged) - # assert 'Did you mean' in result.stdout class TestCLIIntegration: @@ -279,6 +316,8 @@ def test_full_workflow(self, titanic_csv, temp_output_dir): # Step 3: Generate all report formats for fmt in ["md", "json", "html", "pdf"]: + if fmt == "pdf" and not has_pdf_deps(): + continue result = run_cli(["report", titanic_csv, "--format", fmt, "--full"], cwd=temp_output_dir) assert result.returncode == 0 diff --git a/tests/test_code_execution.py b/tests/test_code_execution.py new file mode 100644 index 0000000..370d3a7 --- /dev/null +++ b/tests/test_code_execution.py @@ -0,0 +1,105 @@ +import numpy as np +import pandas as pd +import pytest + +from hashprep.preparers.codegen import CodeGenerator +from hashprep.preparers.models import ( + EncodeMethod, + FixSuggestion, + FixType, + ImputeMethod, + ScaleMethod, +) +from 
hashprep.preparers.pipeline_builder import PipelineBuilder + + +def test_execute_pandas_fixes(): + suggestions = [ + FixSuggestion( + fix_type=FixType.DROP_COLUMN, + columns=["col_to_drop"], + reason="Empty", + ), + FixSuggestion( + fix_type=FixType.IMPUTE, + columns=["col_to_impute"], + method=ImputeMethod.MEAN.value, + reason="Missing", + ), + FixSuggestion( + fix_type=FixType.ENCODE, + columns=["col_to_encode"], + method=EncodeMethod.ONEHOT.value, + reason="Categorical", + ), + ] + + df = pd.DataFrame( + { + "col_to_drop": [np.nan, np.nan, np.nan], + "col_to_impute": [1.0, np.nan, 3.0], + "col_to_encode": ["A", "B", "A"], + "other_col": [10, 20, 30], + } + ) + + gen = CodeGenerator(suggestions) + code = gen.generate_pandas_script() + + # Define a namespace to execute the code + namespace = {} + exec(code, namespace) + + # Call apply_fixes + apply_fixes = namespace["apply_fixes"] + result_df = apply_fixes(df.copy()) + + assert "col_to_drop" not in result_df.columns + assert result_df["col_to_impute"].isnull().sum() == 0 + assert result_df["col_to_impute"].iloc[1] == 2.0 # Mean of 1.0 and 3.0 + # One-hot encoding should create new columns + assert any("col_to_encode" in col for col in result_df.columns) + + +@pytest.mark.skipif( + not pytest.importorskip("sklearn", reason="sklearn not installed"), + reason="sklearn not installed", +) +def test_execute_sklearn_pipeline(): + suggestions = [ + FixSuggestion( + fix_type=FixType.IMPUTE, + columns=["col_to_impute"], + method=ImputeMethod.MEDIAN.value, + reason="Missing", + ), + FixSuggestion( + fix_type=FixType.SCALE, + columns=["col_to_scale"], + method=ScaleMethod.STANDARD.value, + reason="Scale", + ), + ] + + df = pd.DataFrame( + { + "col_to_impute": [10.0, np.nan, 30.0], + "col_to_scale": [1.0, 2.0, 3.0], + } + ) + + builder = PipelineBuilder(suggestions) + code = builder.generate_pipeline_code() + + namespace = {} + exec(code, namespace) + + build_pipeline = namespace["build_preprocessing_pipeline"] + pipeline = 
build_pipeline() + + result = pipeline.fit_transform(df) + + assert result.shape == (3, 2) + assert not np.any(np.isnan(result)) + # Check scaling (mean should be approx 0) + assert np.abs(np.mean(result[:, 1])) < 1e-7 diff --git a/tests/test_config_loader.py b/tests/test_config_loader.py index 86eecd1..6ef4113 100644 --- a/tests/test_config_loader.py +++ b/tests/test_config_loader.py @@ -44,13 +44,14 @@ def test_multiple_section_override(self): assert cfg.correlations == DEFAULT_CONFIG.correlations def test_unknown_keys_are_ignored(self): - # Should not raise - cfg = config_from_dict({"nonexistent_section": {"foo": 1}}) - assert cfg == DEFAULT_CONFIG + """Unknown keys should now raise ValueError.""" + with pytest.raises(ValueError, match="Unknown configuration key"): + config_from_dict({"nonexistent_section": {"foo": 1}}) def test_unknown_nested_keys_are_ignored(self): - cfg = config_from_dict({"outliers": {"z_score": 3.0, "nonexistent": 99}}) - assert cfg.outliers.z_score == 3.0 + """Unknown nested keys should now raise ValueError.""" + with pytest.raises(ValueError, match="Unknown configuration key"): + config_from_dict({"outliers": {"z_score": 3.0, "nonexistent": 99}}) def test_returns_hashprepconfig_instance(self): cfg = config_from_dict({}) diff --git a/tests/test_library_api.py b/tests/test_library_api.py index 2dfdfa8..fee87be 100644 --- a/tests/test_library_api.py +++ b/tests/test_library_api.py @@ -15,6 +15,20 @@ from hashprep.utils.sampling import SamplingConfig +def has_pdf_deps(): + try: + from hashprep.reports.pdf import PdfReport + + # Try to initialize it to catch weasyprint dependency issues + PdfReport() + return True + except Exception: + return False + + +skip_pdf = pytest.mark.skipif(not has_pdf_deps(), reason="PDF dependencies (weasyprint/libgobject) missing") + + @pytest.fixture def sample_dataframe(): """Create a sample DataFrame for testing.""" @@ -217,6 +231,7 @@ def test_html_report_neubrutalism(self, sample_dataframe): if 
os.path.exists(output_file): os.remove(output_file) + @skip_pdf def test_pdf_report(self, sample_dataframe): """Test PDF report generation.""" analyzer = DatasetAnalyzer(sample_dataframe) @@ -390,6 +405,9 @@ def test_titanic_all_report_formats(self, titanic_csv): formats = ["md", "json", "html", "pdf"] for fmt in formats: + if fmt == "pdf" and not has_pdf_deps(): + continue + with tempfile.NamedTemporaryFile(mode="w", suffix=f".{fmt}", delete=False) as f: output_file = f.name diff --git a/web/src/lib/components/Hero.svelte b/web/src/lib/components/Hero.svelte index 53fd3e6..73537ce 100644 --- a/web/src/lib/components/Hero.svelte +++ b/web/src/lib/components/Hero.svelte @@ -45,7 +45,7 @@
$ pip install hashprep
-Successfully installed hashprep-0.1.0b3
+Successfully installed hashprep-0.1.0
$ hashprep scan train.csv --target Survived