diff --git a/README.md b/README.md deleted file mode 100644 index da7fda4..0000000 --- a/README.md +++ /dev/null @@ -1,62 +0,0 @@ -# MCYJ Parsing Script - -## 1. Get all the available documents from the Michigan Welfare public search API - -```bash -python pull_agency_info_api.py --output-dir metadata_output --overwrite=False --verbose -``` - -This will output the agency info and correpsonding documents to the `metadata_output` directory. -The default behavior will output all available documents in both json and csv formats. - -### 1. Output -```bash -ls metadata_output -#> 2025-10-30_agency_info.csv -#> 2025-10-30_all_agency_info.json -#> 2025-10-30_combined_pdf_content_details.csv -``` - -## 2. Get a list of extra and missing files in the downloaded files - -```r -python get_download_list.py --download-folder Downloads --available-files "metadata_output/$(date +"%Y-%m-%d")_combined_pdf_content_details.csv" -``` - -### 2. Output -```bash -ls metadata_output -#> 2025-10-30_agency_info.csv -#> 2025-10-30_all_agency_info.json -#> 2025-10-30_combined_pdf_content_details.csv -#> extra_files.txt -#> missing_files.csv -``` - - - `extra_files.txt` contains files that are in `Downloads` but are not found from the API (most likely due to naming discrepancies) - - `missing_Files.csv` contains missing files in the csv format with header: - -``` -generated_filename,agency_name,agency_id,FileExtension,CreatedDate,Title,ContentBodyId,Id,ContentDocumentId -``` - -## 3. Download missing documents - -```bash -python download_all_pdfs.py --csv metadata_output/missing_files.csv --output-dir Downloads -``` - -### 3. Output - -```bash -$ ls downloads/ | head -# 42ND_CIRCUIT_COURT_-_FAMILY_DIVISION_42ND_CIRCUIT_COURT_-_FAMILY_DIVISION_Interim_2025_2025-07-18_069cs0000104BR0AAM.pdf -# ADOPTION_AND_FOSTER_CARE_SPECIALISTS,_INC._CB440295542_INSP_201_2020-03-14_0698z000005Hpu5AAC.pdf -# ADOPTION_AND_FOSTER_CARE_SPECIALISTS,_INC._CB440295542_ORIG.pdf_2008-06-24_0698z000005HozQAAS.pdf -# ADOPTION_ASSOCIATES,_INC_Adoption_Associates_INC_Renewal_2025_2025-08-20_069cs0000163byMAAQ.pdf -# ADOPTION_OPTION,_INC._CB560263403_ORIG.pdf_2004-05-08_0698z000005Hp18AAC.pdf -``` - -## 4. Check duplicates and update file metadata - -check the md5sums \ No newline at end of file diff --git a/download_all_pdfs.py b/download_all_pdfs.py deleted file mode 100644 index f02f731..0000000 --- a/download_all_pdfs.py +++ /dev/null @@ -1,118 +0,0 @@ -#!/usr/bin/env python3 -""" -Script to download all PDFs listed in a CSV by calling download_michigan_pdf -from `download_pdf.py` for each row. - -Expected CSV headers: -generated_filename,agency_name,agency_id,FileExtension,CreatedDate,Title,ContentBodyId,Id,ContentDocumentId - -Usage: -python download_all_pdfs.py --csv /path/to/file.csv --output-dir ./pdfs -""" -import csv -import os -import argparse -import time -from typing import Optional - -# Import functions from download_pdf.py -try: - from download_pdf import download_michigan_pdf -except Exception as e: - raise SystemExit(f"Failed to import download_michigan_pdf from download_pdf.py: {e}") - - -def process_csv(csv_path: str, output_dir: str, skip_existing: bool = True, limit: Optional[int] = None, sleep_seconds: float = 0.0): - """Read CSV and call download_michigan_pdf for each row. 
- - Parameters: - csv_path: path to input CSV - output_dir: directory where PDFs will be saved - skip_existing: if True and generated_filename present, skip if file exists - limit: optional max number of rows to process - """ - if not os.path.exists(csv_path): - raise FileNotFoundError(f"CSV file not found: {csv_path}") - - os.makedirs(output_dir, exist_ok=True) - - processed = 0 - failed = 0 - - with open(csv_path, newline='', encoding='utf-8') as fh: - reader = csv.DictReader(fh) - for row in reader: - if limit is not None and processed >= limit: - break - - # Extract required fields from the CSV header - gen_filename = (row.get('generated_filename') or '').strip() - agency_name = (row.get('agency_name') or '').strip() - agency_id = (row.get('agency_id') or '').strip() - file_ext = (row.get('FileExtension') or '').strip() - created_date = (row.get('CreatedDate') or '').strip() - title = (row.get('Title') or '').strip() - content_body_id = (row.get('ContentBodyId') or '').strip() - id_field = (row.get('Id') or '').strip() - content_document_id = (row.get('ContentDocumentId') or '').strip() - - # The download function needs ContentDocumentId (document_id); - # fill other args from CSV. - if not content_document_id: - print(f"Skipping row with missing ContentDocumentId: {row}") - failed += 1 - continue - - # If a generated_filename is provided, optionally skip download when file exists - if gen_filename: - target_path = os.path.join(output_dir, gen_filename) - if skip_existing and os.path.exists(target_path): - print(f"Skipping existing file: {target_path}") - processed += 1 - continue - - try: - print(f"Downloading document {content_document_id} (agency: {agency_name}, title: {title})") - out_path = download_michigan_pdf( - document_id=content_document_id, - document_agency=agency_name if agency_name else None, - document_name=title if title else None, - document_date=created_date if created_date else None, - output_dir=output_dir - ) - - if out_path: - print(f"Saved to: {out_path}") - else: - print(f"Download returned None for {content_document_id}") - failed += 1 - - except Exception as e: - print(f"Error downloading {content_document_id}: {e}") - failed += 1 - - processed += 1 - # Sleep between downloads if requested - if sleep_seconds and sleep_seconds > 0: - try: - print(f"Sleeping for {sleep_seconds} seconds...") - time.sleep(sleep_seconds) - except KeyboardInterrupt: - print("Sleep interrupted by user.") - break - - print(f"Done. Processed: {processed}. 
Failures: {failed}.") - return processed, failed - - -if __name__ == '__main__': - parser = argparse.ArgumentParser(description='Download PDFs listed in a CSV using download_michigan_pdf from download_pdf.py') - parser.add_argument('--csv', required=True, help='Path to input CSV file') - parser.add_argument('--output-dir', required=True, help='Directory to save downloaded PDFs') - parser.add_argument('--no-skip', dest='skip_existing', action='store_false', help='Do not skip when generated_filename exists') - parser.add_argument('--limit', type=int, default=None, help='Optional max number of rows to process') - parser.add_argument('--sleep', dest='sleep_seconds', type=float, default=0.0, help='Seconds to sleep between downloads (float allowed)') - - args = parser.parse_args() - - process_csv(args.csv, args.output_dir, skip_existing=args.skip_existing, limit=args.limit, sleep_seconds=args.sleep_seconds) diff --git a/download_pdf.py b/download_pdf.py deleted file mode 100644 index d59e1a0..0000000 --- a/download_pdf.py +++ /dev/null @@ -1,193 +0,0 @@ -import requests -import base64 -import urllib3 -import os -import re -import argparse - -def get_content_base_data(document_id): - """ - POST request to fetch content base data for a given ContentDocumentId. - """ - urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning) - - # Use same base endpoint as other functions; include the query params - base_url = "https://michildwelfarepubliclicensingsearch.michigan.gov/licagencysrch/webruntime/api/apex/execute?language=en-US&asGuest=true&htmlEncode=false" - - payload = { - "namespace": "", - "classname": "@udd/01p8z0000009E4V", - "method": "getContentBaseData", - "isContinuation": False, - "params": { - "contentDocumentId": document_id, - "actionName": "download" - }, - "cacheable": False - } - - headers = { - 'Accept': 'application/json, text/plain, */*', - 'Content-Type': 'application/json', - 'X-Requested-With': 'XMLHttpRequest', - 'Origin': 'https://michildwelfarepubliclicensingsearch.michigan.gov', - 'Referer': 'https://michildwelfarepubliclicensingsearch.michigan.gov/licagencysrch/' - } - - try: - print(f"POST getContentBaseData for ContentDocumentId={document_id}") - response = requests.post( - base_url, - json=payload, - headers=headers, - verify=False, - timeout=60 - ) - response.raise_for_status() - return response.json() - except Exception as e: - print(f"get_document_body failed for {document_id}: {e}") - if 'response' in locals(): - try: - print(f"Response content: {response.text}") - except Exception: - pass - return None - -# Note: I think we can do the same thing here using get_content_base_data -def download_michigan_pdf(document_id, document_agency=None, document_name=None, document_date=None, output_dir="./"): - """ - Download a PDF from Michigan Child Welfare Public Licensing Search - - Args: - document_id (str): The document ID (e.g., "0698z0000061FxYAAU") - document_agency (str, optional): Name of the agency for filename - document_name (str, optional): Name of the document for filename - output_dir (str): Directory to save the PDF (default: current directory) - - Returns: - str: Path to the downloaded file if successful, None if failed - """ - - # Disable SSL warnings - urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning) - - # Headers to mimic a real browser - headers = { - 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36', - 'Accept': 
'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8', - 'Accept-Language': 'en-US,en;q=0.5', - 'Accept-Encoding': 'gzip, deflate', - 'Connection': 'keep-alive', - 'Upgrade-Insecure-Requests': '1' - } - - try: - # Make the request - res = get_content_base_data(document_id=document_id) - base64_data = res['returnValue'] - - # Decode the PDF content - pdf_content = base64.b64decode(base64_data) - - # Generate filename - filename = generate_filename(document_id, document_agency, document_name, document_date) - - # Ensure output directory exists - os.makedirs(output_dir, exist_ok=True) - - # Full path for the file - file_path = os.path.join(output_dir, filename) - - # Save the PDF - with open(file_path, 'wb') as f: - f.write(pdf_content) - - print(f"PDF downloaded successfully: {file_path}") - print(f"File size: {len(pdf_content)} bytes") - - return file_path - - except requests.exceptions.RequestException as e: - print(f"Error making request: {e}") - return None - except Exception as e: - print(f"Error processing PDF: {e}") - return None - -def generate_filename(document_id, document_agency, document_name, document_date): - """ - Generate a filename based on the provided parameters - - Args: - document_id (str): The document ID - document_agency (str): Agency name - document_name (str): Document name - document_date (str): Document date (not used in this version) - - Returns: - str: Generated filename - """ - # Clean up strings to be filesystem-safe - def clean_string(s): - if not s: - return "" - # Remove/replace problematic characters - s = re.sub(r'[<>:"/\\|?*]', '_', s) - # Remove extra whitespace - s = re.sub(r'\s+', '_', s) - # Remove leading/trailing underscores - s = s.strip('_') - return s - - # Build filename components - parts = [] - - if document_agency: - parts.append(clean_string(document_agency)) - - if document_name: - parts.append(clean_string(document_name)) - - if document_date: - # Ensure the date is in YYYY-MM-DD format - match = re.match(r'(\d{4})[-/](\d{2})[-/](\d{2})', str(document_date)) - # Raise error if month > 12 - if match: - if int(match.group(2)) > 12: - raise ValueError("Month in document date cannot be greater than 12") - formatted_date = f"{match.group(1)}-{match.group(2)}-{match.group(3)}" - parts.append(formatted_date) - else: - # If not in correct format, skip or handle as needed - raise ValueError("Document date must be in YYYY-MM-DD format") - - # Always include the document ID - parts.append(clean_string(document_id)) - - # Join parts with underscores - filename = '_'.join(parts) - - # Ensure it ends with .pdf - if not filename.lower().endswith('.pdf'): - filename += '.pdf' - - return filename - -if __name__ == "__main__": - parser = argparse.ArgumentParser(description="Download Michigan Child Welfare PDF by document ID.") - parser.add_argument("document_id", help="The document ID (e.g., 0698z0000061FxYAAU)") - parser.add_argument("--agency", dest="document_agency", help="Agency name for filename", default=None) - parser.add_argument("--name", dest="document_name", help="Document name for filename", default=None) - parser.add_argument("--output-dir", dest="output_dir", help="Directory to save the PDF", default="./") - parser.add_argument("--date", dest="document_date", help="Document date for filename (YYYY-MM-DD)", default=None) - - args = parser.parse_args() - - download_michigan_pdf( - document_id=args.document_id, - document_agency=args.document_agency, - document_name=args.document_name, - document_date=args.document_date, - 
output_dir=args.output_dir - ) \ No newline at end of file diff --git a/get_download_list.py b/get_download_list.py deleted file mode 100644 index 04e020a..0000000 --- a/get_download_list.py +++ /dev/null @@ -1,105 +0,0 @@ -import os -import argparse -import csv -from datetime import datetime -import re - -def get_downloaded_files(download_folder, lower = True): - all_files = os.listdir(download_folder) - if lower: - return set(f.lower() for f in all_files) - else: - return set(all_files) - -if __name__ == "__main__": - parser = argparse.ArgumentParser(description="Check for missing downloaded files.") - parser.add_argument("--download-folder", help="Folder containing downloaded files") - parser.add_argument("--available-files", help="File listing expected files") - args = parser.parse_args() - - downloaded_files = get_downloaded_files(args.download_folder) - downloaded_files_no_date = {re.sub(r'_\d{4}-\d{2}-\d{2}\.pdf$', '', f) for f in downloaded_files if f.endswith('.pdf')} - expected_files = set() - expected_files_no_date = set() - expected_files_info = [] # Store complete row information - filename_to_row = {} # Map filename to complete row - - # Get the directory of the available-files for output - available_files_dir = os.path.dirname(args.available_files) - if not available_files_dir: - available_files_dir = "." # Current directory if no path specified - - # Read the expected files as csv dict - with open(args.available_files, "r") as csvfile: - reader = csv.DictReader(csvfile) - for row in reader: - #print(row) - document_agency = row.get("agency_name", "") - document_agency = document_agency.strip().replace(" ", "_").replace("/", "_") - document_name = row.get("Title", "") - document_name = document_name.strip().replace(" ", "_").replace("/", "-") - created_date = row.get("CreatedDate", "") - extension = row.get("FileExtension", "pdf") - #print(f"Processing: {document_agency}, {document_name}, {created_date}, {extension}") - # datetime.strptime(sanitized_date, '%m-%d-%Y').date() - document_date = datetime.strptime(created_date, "%Y-%m-%dT%H:%M:%S.%fZ").date() - filename_with_date = f"{document_agency}_{document_name}_{document_date}.{extension}".lower() - filename_no_date = f"{document_agency}_{document_name}".lower() - - expected_files.add(filename_with_date) - expected_files_no_date.add(filename_no_date) - expected_files_info.append(row) - filename_to_row[filename_no_date] = row - - - # If the files have the same agency and name, but the date is different, we consider it a different file - - extra_files = downloaded_files_no_date - expected_files_no_date - missing_files = expected_files_no_date - downloaded_files_no_date - -# print(list(downloaded_files)[:5]) - - # Print intersection - print("Files in both downloaded and expected:") - for f in sorted(downloaded_files & expected_files): - print(f) - - print("Files in downloaded but not expected:") - # Write download files to a file - extra_files_path = os.path.join(available_files_dir, "extra_files.txt") - with open(extra_files_path, "w") as f: - for file in sorted(extra_files): - f.write(file + "\n") - - # Write expected files to a file using CSV writer - missing_files_path = os.path.join(available_files_dir, "missing_files.csv") - with open(missing_files_path, "w", newline='') as f: - if missing_files and expected_files_info: - # Get the first row to extract headers - headers = list(expected_files_info[0].keys()) - writer = csv.writer(f) - - # Write header row with filename as additional column - 
writer.writerow(["generated_filename"] + headers) - - # Write data rows for missing files - for file in sorted(missing_files): - if file in filename_to_row: - row = filename_to_row[file] - row_values = [row.get(header, "") for header in headers] - # Add file extension to the generated filename - file_extension = row.get("FileExtension", "pdf") - filename_with_extension = f"{file}.{file_extension}" - writer.writerow([filename_with_extension] + row_values) - else: - # If no row data available, write just the filename - writer.writerow([file] + [""] * len(headers)) - else: - # Fallback to just filenames if no row data available - writer = csv.writer(f) - writer.writerow(["generated_filename"]) # Simple header - for file in sorted(missing_files): - writer.writerow([file]) - - print(len(missing_files), "missing files found.") - print(len(extra_files), "extra files found.") \ No newline at end of file diff --git a/ingestion/README.md b/ingestion/README.md new file mode 100644 index 0000000..63500bd --- /dev/null +++ b/ingestion/README.md @@ -0,0 +1,107 @@ +# MCYJ Primary Document Ingestion + +This directory contains scripts for downloading the current set of documents available from the [Michigan Welfare Licensing Search](https://michildwelfarepubliclicensingsearch.michigan.gov/licagencysrch/). The pipeline uses two separate directories: + +1. **Download directory**: Persistent storage for PDF documents (accumulates over time) +2. **Run directory**: Run-specific metadata, logs, and artifacts for each round of ingestion + +The scripts are idempotent - running them multiple times will only download new documents. + +## Quick Start + +The ingestion process consists of two simple steps: + +### Step 1: Get Agency Metadata + +Pull all available documents from the Michigan API: + +```bash +python pull_agency_info_api.py --run-dir run_2025-11-03 --overwrite=False --verbose +``` + +**Output** (in `run_2025-11-03/`): +- `YYYY-MM-DD_agency_info.csv` +- `YYYY-MM-DD_all_agency_info.json` +- `YYYY-MM-DD_combined_pdf_content_details.csv` + +### Step 2: Download Documents + +Download PDFs directly from the content details CSV: + +```bash +python download_all_pdfs.py --csv "run_2025-11-03/$(date +"%Y-%m-%d")_combined_pdf_content_details.csv" --download-dir Downloads +``` + +The download script automatically: +- Detects existing files by ContentDocumentId pattern (`*_{ContentDocumentId}.pdf`) +- Skips files that already exist (unless `--no-skip` is used) +- Verifies content integrity using SHA256 hashes +- Handles files with different agency names (agency renames, etc.) 
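+
+The existing-file check above works purely on filenames. As an illustration only (assuming the default `Downloads/` directory and reusing the example ContentDocumentId that appears in the module docs), you can reproduce the check by hand:
+
+```bash
+# If this glob matches anything, download_all_pdfs.py treats the document as already downloaded
+ls Downloads/*_0698z0000061FxYAAU.pdf 2>/dev/null || echo "not downloaded yet"
+```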
+
+**Options:**
+- `--no-skip`: Re-download existing files (default: skip existing files)
+- `--limit N`: Download at most N files (useful for testing)
+- `--sleep SECONDS`: Delay between downloads (default: 0.1)
+
+### Directory Structure
+
+After running, you'll have two separate directories:
+
+**Download directory** (`Downloads/`):
+- PDF files named `{agency_name}_{ContentDocumentId}.pdf` (see **File Naming** under Known Limitations)
+- This directory persists and accumulates PDFs across multiple runs
+
+**Run directory** (`run_2025-11-03/`):
+- `YYYY-MM-DD_agency_info.csv` - Agency information
+- `YYYY-MM-DD_all_agency_info.json` - Complete API response
+- `YYYY-MM-DD_combined_pdf_content_details.csv` - All available documents
+- This directory is specific to each run and contains all metadata/logs
+
+## Example: Test Run
+
+To test with only 10 downloads:
+
+```bash
+# Step 1: Get metadata
+python pull_agency_info_api.py --run-dir run_test --verbose
+
+# Step 2: Download with limit
+python download_all_pdfs.py --csv "run_test/$(date +"%Y-%m-%d")_combined_pdf_content_details.csv" --download-dir Downloads --limit 10
+```
+
+## Re-running the Pipeline
+
+The ingestion pipeline is designed to be run repeatedly:
+
+1. **Daily updates**: Run the two-step process daily with a new run directory to pick up new documents
+2. **Incremental downloads**: Only new/missing documents are downloaded
+3. **Metadata refresh**: Pass `--overwrite` to pull_agency_info_api.py to force a metadata refresh
+
+```bash
+# Daily cron job example - creates a new run directory each day
+# Step 1: Get metadata
+0 2 * * * cd /path/to/ingestion && python pull_agency_info_api.py --run-dir "run_$(date +\%Y-\%m-\%d)"
+# Step 2: Download files
+5 2 * * * cd /path/to/ingestion && python download_all_pdfs.py --csv "run_$(date +\%Y-\%m-\%d)/$(date +\%Y-\%m-\%d)_combined_pdf_content_details.csv" --download-dir Downloads
+```
+
+### Directory Best Practices
+
+- **Download directory**: Single persistent directory (e.g., `Downloads/`) that accumulates all PDFs
+- **Run directories**: Create a new one for each execution (e.g., `run_2025-11-03`, `run_2025-11-04`)
+- Keep run directories for audit trails and historical metadata
+
+## Known Limitations
+
+The Michigan API has an important limitation regarding document versioning:
+
+- **The API only exposes the latest version of each document**
+- Historical versions of documents are NOT accessible through the API
+- Each document has a unique `ContentDocumentId` (069...) used for downloading
+- Documents also have a `ContentVersionId` (068...) that changes when updated
+- However, the API does NOT support downloading by `ContentVersionId`
+- When a document is updated, the old version becomes inaccessible
+
+In practice, however, we believe Michigan *never* publishes multiple versions of a document on this platform. We sometimes see the opposite: several identical files exposed under different `ContentDocumentId` values (a quick way to spot these is sketched below). For now we operate as if `ContentDocumentId` is sufficient; if we ever find the PDF behind a `ContentDocumentId` changing, we will adjust accordingly.
+
+**File Naming**: Files are named `{agency_name}_{ContentDocumentId}.pdf`, where `ContentDocumentId` uniquely identifies each document (but only its latest version is downloadable).
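+
+Because identical content can appear under several `ContentDocumentId`s, a quick duplicate check over the download directory can be useful. This is only a sketch; it assumes GNU coreutils (`md5sum`, `uniq --all-repeated`) and the default `Downloads/` directory:
+
+```bash
+# Print groups of byte-identical PDFs (same MD5 hash, different filenames), separated by blank lines
+md5sum Downloads/*.pdf | sort | uniq -w32 --all-repeated=separate
+```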
\ No newline at end of file diff --git a/ingestion/download_all_pdfs.py b/ingestion/download_all_pdfs.py new file mode 100644 index 0000000..86bfbe0 --- /dev/null +++ b/ingestion/download_all_pdfs.py @@ -0,0 +1,157 @@ +#!/usr/bin/env python3 +""" +Batch PDF Downloader - Downloads PDFs from Content Details CSV + +This script downloads PDFs listed in a CSV file (such as combined_pdf_content_details.csv) +by calling download_michigan_pdf from download_pdf.py for each row. It handles batch +downloads with progress tracking, error handling, and rate limiting. + +The script performs these operations: +1. Reads a CSV file containing document metadata +2. For each document, checks if it already exists (by ContentDocumentId pattern) +3. Downloads missing PDFs from the Michigan API +4. Skips files that already exist (configurable with --no-skip) +5. Applies rate limiting between downloads +6. Reports progress and failures + +Expected CSV headers: + ContentDocumentId,agency_name (required) + Other columns are ignored but can be present + +File Naming Convention: + Files are named using ContentDocumentId as the unique identifier. + Format: {agency_name}_{ContentDocumentId}.pdf + Example: glens_house_0698z0000061FxYAAU.pdf + + Note: The API only exposes the latest version of each document. Historical versions + are not accessible. + +Usage: + python download_all_pdfs.py --csv combined_pdf_content_details.csv --download-dir Downloads [--limit 100] [--sleep 0.5] + +Options: + --csv: Path to CSV file with document metadata (e.g., combined_pdf_content_details.csv) + --download-dir: Directory to save downloaded PDFs + --no-skip: Re-download files even if they exist + --limit: Maximum number of files to download (for testing) + --sleep: Seconds to sleep between downloads (default: 0.1) + +Author: STATCOM MCYJ project +""" +import csv +import os +import argparse +import time +import logging +from typing import Optional + +# Set up logger +logger = logging.getLogger(__name__) + +# Import functions from download_pdf.py +from download_pdf import download_michigan_pdf, generate_filename +import glob + +def process_csv(csv_path: str, output_dir: str, skip_existing: bool = True, limit: Optional[int] = None, sleep_seconds: Optional[float] = 0.1): + """Read CSV and call download_michigan_pdf for each row. + + Parameters: + csv_path: path to input CSV + output_dir: directory where PDFs will be saved + skip_existing: if True and file with matching ContentDocumentId + is present, skip if file exists. If False, download and verify + that sha256 matches existing file (we will throw an exception and + abort entire run if sha256 does not match, as this would indicates data + inconsistency that requires investigation). + limit: optional max number of rows to process + sleep_seconds: seconds to sleep between downloads to respect + rate limiting and avoid server overload. 
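+
+    Returns (inferred from the function body):
+        tuple: (processed, failed) row counts; 'failed' includes rows with a
+        missing ContentDocumentId as well as rows whose download raised an error.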
+ """ + + if not os.path.exists(csv_path): + raise FileNotFoundError(f"CSV file not found: {csv_path}") + + os.makedirs(output_dir, exist_ok=True) + + processed = 0 + failed = 0 + + with open(csv_path, newline='', encoding='utf-8') as fh: + reader = csv.DictReader(fh) + for row in reader: + if limit is not None and processed >= limit: + break + + # Extract required fields from the CSV header + content_document_id = (row.get('ContentDocumentId') or '').strip() + agency_name = (row.get('agency_name') or '').strip() + + # Validate required fields + if not content_document_id: + logger.warning(f"Skipping row with missing ContentDocumentId: {row}") + failed += 1 + continue + + # Use glob to find existing files with pattern *_ContentDocumentId.pdf + pattern = os.path.join(output_dir, f"*_{content_document_id}.pdf") + existing_files = glob.glob(pattern) + + # if there is exactly one existing file + # we consider that the filename to use + # (and possibly skip download) + if len(existing_files)==1: + filename = os.path.basename(existing_files[0]) + if skip_existing: + logger.info(f"Skipping existing file: {filename}") + processed += 1 + continue + elif len(existing_files) > 1: + raise ValueError(f"Multiple existing files found for ContentDocumentId={content_document_id}: {existing_files}") + else: + filename = generate_filename(content_document_id, agency_name) + + try: + logger.info(f"Downloading document {content_document_id} (agency: {agency_name})") + file_path = os.path.join(output_dir, filename) + out_path = download_michigan_pdf( + document_id=content_document_id, + file_path=file_path + ) + logger.info(f"Saved to: {out_path}") + + except Exception as e: + logger.error(f"Error downloading {content_document_id}: {e}") + failed += 1 + + processed += 1 + # Sleep between downloads if requested + if sleep_seconds and sleep_seconds > 0: + try: + logger.debug(f"Sleeping for {sleep_seconds} seconds...") + time.sleep(sleep_seconds) + except KeyboardInterrupt: + logger.info("Sleep interrupted by user.") + break + + logger.info(f"Done. Processed: {processed}. 
Failures: {failed}.") + return processed, failed + + +if __name__ == '__main__': + # Set up logging for script use + logging.basicConfig( + level=logging.INFO, + format='%(asctime)s - %(name)s - %(levelname)s - %(message)s', + datefmt='%Y-%m-%d %H:%M:%S' + ) + + parser = argparse.ArgumentParser(description='Download PDFs from content details CSV using download_michigan_pdf') + parser.add_argument('--csv', required=True, help='Path to content details CSV file (e.g., combined_pdf_content_details.csv)') + parser.add_argument('--download-dir', required=True, help='Directory to save downloaded PDFs') + parser.add_argument('--no-skip', dest='skip_existing', action='store_false', help='Do not skip when generated_filename exists') + parser.add_argument('--limit', type=int, default=None, help='Optional max number of rows to process') + parser.add_argument('--sleep', dest='sleep_seconds', type=float, default=0.1, help='Seconds to sleep between downloads (float allowed)') + + args = parser.parse_args() + + process_csv(args.csv, args.download_dir, skip_existing=args.skip_existing, limit=args.limit, sleep_seconds=args.sleep_seconds) diff --git a/ingestion/download_pdf.py b/ingestion/download_pdf.py new file mode 100644 index 0000000..5eb5eb9 --- /dev/null +++ b/ingestion/download_pdf.py @@ -0,0 +1,263 @@ +""" +Single PDF Downloader - Core Download Function + +This script provides the core functionality for downloading a single PDF document +from the Michigan Child Welfare Public Licensing Search system. It can be used +as a standalone script or imported by other scripts (like download_all_pdfs.py). + +The script performs these operations: +1. Fetches document content from the Michigan API using ContentDocumentId +2. Decodes the base64-encoded PDF data +3. Generates a standardized filename from agency, document name, and date +4. Saves the PDF to the specified download directory + +Usage as standalone: + python download_pdf.py --csv run_2025-11-03/combined_pdf_content_details.csv --download-dir Downloads + +Usage as module: + from download_pdf import download_michigan_pdf + + # Download with explicit file path + download_michigan_pdf(document_id="0698z0000061FxYAAU", file_path="Downloads/my_document.pdf") + +Arguments (standalone): + document_id: The ContentDocumentId from the API (required) + --csv: CSV file to lookup agency name (required) + --download-dir: Directory to save the PDF (default: current directory) + +Note: If a file already exists, the script will download the content and compare SHA256 +hashes. If they match, it succeeds without overwriting. If they differ, it raises an error. + +Author: STATCOM MCYJ project +""" + +import requests +import base64 +import urllib3 +import os +import re +import argparse +import logging +import hashlib +from io import BytesIO + +# Set up logger +logger = logging.getLogger(__name__) + +def get_content_base_data(document_id): + """ + POST request to fetch content base data for a given ContentDocumentId. + + Args: + document_id (str): ContentDocumentId (069...) + + Returns: + dict: JSON response with base64-encoded PDF data + None: If request fails + + Note: + The API only supports downloading by ContentDocumentId, which always returns + the latest version. ContentVersionId cannot be used for downloading. 
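+
+        The decoded PDF bytes are carried in the 'returnValue' key as a base64
+        string, which is how download_michigan_pdf() consumes this response.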
+ """ + urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning) + + # Use same base endpoint as other functions; include the query params + base_url = "https://michildwelfarepubliclicensingsearch.michigan.gov/licagencysrch/webruntime/api/apex/execute?language=en-US&asGuest=true&htmlEncode=false" + + params_dict = { + "contentDocumentId": document_id, + "actionName": "download" + } + + payload = { + "namespace": "", + "classname": "@udd/01p8z0000009E4V", + "method": "getContentBaseData", + "isContinuation": False, + "params": params_dict, + "cacheable": False + } + + headers = { + 'Accept': 'application/json, text/plain, */*', + 'Content-Type': 'application/json', + 'X-Requested-With': 'XMLHttpRequest', + 'Origin': 'https://michildwelfarepubliclicensingsearch.michigan.gov', + 'Referer': 'https://michildwelfarepubliclicensingsearch.michigan.gov/licagencysrch/' + } + + logger.info(f"POST getContentBaseData for ContentDocumentId={document_id}") + response = requests.post( + base_url, + json=payload, + headers=headers, + verify=False, + timeout=60 + ) + response.raise_for_status() + return response.json() + +def download_michigan_pdf(document_id, file_path): + """ + Download a PDF from Michigan Child Welfare Public Licensing Search + + Args: + document_id (str): The ContentDocumentId (e.g., "0698z0000061FxYAAU") + file_path (str): Where to download the PDF file + + Returns: + str: Path to the downloaded file if successful, None if failed + + Raises: + ValueError: If API request fails or if existing file has different content (SHA256 mismatch) + + Note: + Downloads using ContentDocumentId, which always returns the latest version. + The API does not support downloading historical versions. + If file exists, compares SHA256 hash - succeeds if identical, raises ValueError if different. + """ + + # Disable SSL warnings + urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning) + + logger.info(f"Downloading document: {document_id}") + res = get_content_base_data(document_id=document_id) + + if not res or 'returnValue' not in res: + raise ValueError(f"Failed to get PDF data from API for document ID: {document_id}") + + base64_data = res['returnValue'] + + # Decode the PDF content + pdf_content = base64.b64decode(base64_data) + + # Calculate SHA256 of the new content + new_content_hash = hashlib.sha256(pdf_content).hexdigest() + + # Check if file exists and compare hashes + if os.path.exists(file_path): + logger.info(f"File already exists: {file_path}") + logger.info("Comparing SHA256 hashes...") + + # Read existing file and calculate its hash + with open(file_path, 'rb') as f: + existing_content = f.read() + existing_content_hash = hashlib.sha256(existing_content).hexdigest() + + if new_content_hash == existing_content_hash: + logger.info(f"SHA256 hashes match - file is identical") + logger.info(f"Downloaded {len(pdf_content)} bytes and verified existing file") + return file_path + else: + raise ValueError( + f"File exists but content differs (SHA256 mismatch):\n" + f" Existing: {existing_content_hash}\n" + f" New: {new_content_hash}\n" + f" File: {file_path}" + ) + + # Save the PDF (only if file doesn't exist or hashes match) + with open(file_path, 'wb') as f: + f.write(pdf_content) + + logger.info(f"Downloaded {len(pdf_content)} bytes") + + return file_path + +def generate_filename(document_id, document_agency): + """ + Generate a filename based on ContentDocumentId and agency name. 
+ + Args: + document_id (str): The ContentDocumentId (e.g., "0698z0000061FxYAAU") + document_agency (str): Agency name + + Returns: + str: Generated filename in format: {agency_name}_{document_id}.pdf + + Example: + generate_filename("0698z0000061FxYAAU", "Glen's House") + -> "glens_house_0698z0000061FxYAAU.pdf" + """ + # Clean up agency name to be filesystem-safe + def clean_string(s): + if not s: + return "" + # Remove/replace problematic characters + s = re.sub(r'[<>:"/\\|?*]', '_', s) + # Replace spaces and special chars with underscores + s = re.sub(r'\s+', '_', s) + # Remove apostrophes and quotes + s = s.replace("'", "").replace('"', '') + # Remove leading/trailing underscores + s = s.strip('_') + return s.lower() + + # Build filename: {agency}_{document_id}.pdf + agency_clean = clean_string(document_agency) if document_agency else "unknown" + + # If no document_id provided, use placeholder + if not document_id: + import time + document_id = f"unknown_{int(time.time())}" + + # Format: {agency_name}_{document_id}.pdf + filename = f"{agency_clean}_{document_id}.pdf" + + return filename + +if __name__ == "__main__": + # Set up logging for script use + logging.basicConfig( + level=logging.INFO, + format='%(asctime)s - %(name)s - %(levelname)s - %(message)s', + datefmt='%Y-%m-%d %H:%M:%S' + ) + + parser = argparse.ArgumentParser( + description="Download Michigan Child Welfare PDF by document ID." + ) + parser.add_argument("document_id", help="The document ID (ContentDocumentId, e.g., 0698z0000061FxYAAU)") + parser.add_argument("--csv", required=True, help="CSV file to lookup agency name (e.g., combined_pdf_content_details.csv)") + parser.add_argument("--download-dir", dest="output_dir", help="Directory to save the PDF", default="./") + + args = parser.parse_args() + + # Look up agency name from CSV + import csv as csv_module + agency_name = None + with open(args.csv, 'r', encoding='utf-8') as f: + reader = csv_module.DictReader(f) + for row in reader: + if row.get('ContentDocumentId', '').strip() == args.document_id: + agency_name = row.get('agency_name', '').strip() + break + + if not agency_name: + logger.error(f"Could not find ContentDocumentId={args.document_id} in {args.csv}") + logger.error("Make sure the CSV has 'ContentDocumentId' and 'agency_name' columns") + exit(1) + + logger.info(f"Found agency in CSV: {agency_name}") + + # Ensure output directory exists + os.makedirs(args.output_dir, exist_ok=True) + + # Generate filename and full file path + filename = generate_filename(args.document_id, agency_name) + file_path = os.path.join(args.output_dir, filename) + + try: + result = download_michigan_pdf( + document_id=args.document_id, + file_path=file_path + ) + except Exception as e: + logger.error(f"Download failed: {e}") + exit(1) + + if result: + logger.info(f"Success! File saved to: {result}") + else: + logger.error("Download failed") + exit(1) \ No newline at end of file diff --git a/ingestion/pull_agency_info_api.py b/ingestion/pull_agency_info_api.py new file mode 100644 index 0000000..8d3f082 --- /dev/null +++ b/ingestion/pull_agency_info_api.py @@ -0,0 +1,384 @@ +""" +Michigan Child Welfare Agency Metadata Retrieval + +This script pulls agency information and associated document metadata from the +Michigan Child Welfare Public Licensing Search API. It fetches data for all +agencies and their available documents, saving the results to a run directory. + +The script performs these operations: +1. Fetches all agency information from the API +2. 
For each agency, retrieves associated document metadata +3. Saves all agency info and document details as JSON, + and most of the key information is also stored as CSV +4. Merges all document metadata into a single combined CSV file + +API Functions Overview: + Two main functions interact with the Michigan API: + + 1. get_all_agency_info() + - Fetches list of ALL agencies with basic info (name, address, license) + - Called ONCE at start of pipeline + - API method: 'getAgenciesDetail' with recordId=None + + 2. get_content_details_method(record_id) + - Fetches DOCUMENT listings for ONE specific agency + - Called FOR EACH agency to get their PDF metadata + - API method: 'getContentDetails' with specific recordId + - Returns list of PDFs with ContentDocumentId needed for download + + Key Distinction: + - getAgenciesDetail: Returns AGENCY information (who they are) + - getContentDetails: Returns DOCUMENT listings (what files they have) + +Usage: + python pull_agency_info_api.py --run-dir run_2025-11-03 [--verbose] + +Output (in run directory): + - YYYY-MM-DD_all_agency_info.json: Complete API response with all agencies + - YYYY-MM-DD_agency_info.csv: Agency information in CSV format + - YYYY-MM-DD_combined_pdf_content_details.csv: All documents from all agencies + - Individual JSON/CSV files per agency (removed after merging by default) + +Author: STATCOM MCYJ project +""" + +import csv +import requests +import json +import urllib.parse +import urllib3 +import argparse +import os +import logging +from datetime import datetime + +# Set up logger +logger = logging.getLogger(__name__) + +def get_all_agency_info(): + """ + Fetches basic information for ALL agencies from the Michigan API. + + This function calls the 'getAgenciesDetail' API method with recordId=None to + retrieve a list of all child welfare agencies in Michigan, including basic + information like agency name, address, license status, etc. + + Returns: + dict: JSON response containing a list of all agencies with their basic info + None: If the request fails + + Note: This returns agency metadata ONLY, not the documents/PDFs associated + with each agency. Use get_content_details_method() to fetch documents for + a specific agency. + """ + urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning) + + base_url = "https://michildwelfarepubliclicensingsearch.michigan.gov/licagencysrch/webruntime/api/apex/execute" + + params = { + "cacheable": "true", + "classname": "@udd/01p8z0000009E4V", + "isContinuation": "false", + "method": "getAgenciesDetail", + "namespace": "", + "params": json.dumps({"recordId": None}), + "language": "en-US", + "asGuest": "true", + "htmlEncode": "false" + } + + headers = { + 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36', + 'Accept': 'application/json, text/plain, */*', + 'Accept-Language': 'en-US,en;q=0.9', + 'Referer': 'https://michildwelfarepubliclicensingsearch.michigan.gov/licagencysrch/' + } + + try: + logger.info("GET request with recordId=null") + response = requests.get(base_url, params=params, headers=headers, verify=False, timeout=30) + response.raise_for_status() + return response.json() + except Exception as e: + logger.error(f"GET request with recordId=null failed: {e}") + return None + +def get_content_details_method(record_id): + """ + Fetches document/PDF metadata for a SPECIFIC agency by its record ID. + + This is the function used to get the list of documents associated with a + specific agency. 
+ + This function calls the 'getContentDetails' API method with a specific + recordId to get a list of all documents (PDFs) associated with that agency, + including metadata like: + - Title: Document name/title + - CreatedDate: When the document was created + - FileExtension: Usually 'pdf' + - ContentDocumentId: Unique ID needed to download the actual PDF file + - ContentBodyId: Internal reference ID + + Args: + record_id (str): The agency's unique identifier (agencyId) + + Returns: + dict: JSON response containing list of documents in 'contentVersionRes' key + None: If the request fails + + """ + urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning) + + base_url = "https://michildwelfarepubliclicensingsearch.michigan.gov/licagencysrch/webruntime/api/apex/execute" + + # JSON payload + payload = { + "namespace": "", + "classname": "@udd/01p8z0000009E4V", + "method": "getContentDetails", + "isContinuation": False, + "params": { + "recordId": record_id + }, + "cacheable": False + } + + headers = { + 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36', + 'Accept': 'application/json, text/plain, */*', + 'Content-Type': 'application/json', + 'X-Requested-With': 'XMLHttpRequest', + 'Origin': 'https://michildwelfarepubliclicensingsearch.michigan.gov', + 'Referer': 'https://michildwelfarepubliclicensingsearch.michigan.gov/licagencysrch/' + } + + logger.info("POST with JSON payload directly to the API endpoint") + logger.debug(f"Payload: {json.dumps(payload, indent=2)}") + + response = requests.post( + base_url, + json=payload, + headers=headers, + verify=False, + timeout=30 + ) + response.raise_for_status() + return response.json() + + +def merge_agency_info(agency_csv, run_dir = ".", remove_files=False): + """ + Merges the agency details into the all agency info dictionary. 
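+
+    Args (inferred from the function body):
+        agency_csv: per-run agency info CSV containing 'agencyId' and 'AgencyName' columns
+        run_dir: directory holding the per-agency "{agencyId}_pdf_content_details.csv" files
+        remove_files: if True, delete the per-agency CSV/JSON files after merging
+
+    Returns:
+        str: path to the combined "{YYYY-MM-DD}_combined_pdf_content_details.csv"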
+ """ + date_str = datetime.now().strftime("%Y-%m-%d") + + # Build a mapping from agencyId to AgencyName + agency_names = {} + with open(agency_csv, mode='r', encoding='utf-8') as f: + reader = csv.DictReader(f) + for row in reader: + agency_id = row.get('agencyId') + agency_name = row.get('AgencyName') + if agency_id and agency_name: + agency_names[agency_id] = agency_name + + # Merge PDF content details for each agency + + combined_rows = [] + header = [] + for agency_id, agency_name in agency_names.items(): + pdf_csv = os.path.join(run_dir, f"{agency_id}_pdf_content_details.csv") + if os.path.exists(pdf_csv): + with open(pdf_csv, mode='r', encoding='utf-8') as f: + reader = csv.reader(f) + header = next(reader) + header = ['agency_name'] + header + for row in reader: + combined_rows.append([agency_name] + row) + else: + logger.warning(f"PDF content details CSV not found for agency ID {agency_id}, skipping...") + continue + + # Write out the combined CSV + combined_csv = os.path.join(run_dir, f"{date_str}_combined_pdf_content_details.csv") + with open(combined_csv, mode='w', newline='', encoding='utf-8') as f: + writer = csv.writer(f, quoting=csv.QUOTE_ALL) + # Write header: agency_id + original header + writer.writerow(header) + writer.writerows(combined_rows) + + logger.info(f"Combined PDF content details written to {combined_csv}") + # If remove files then remove each file + if remove_files: + for agency_id, agency_name in agency_names.items(): + pdf_csv = os.path.join(run_dir, f"{agency_id}_pdf_content_details.csv") + json_path = os.path.join(run_dir, f"{agency_id}_pdf_content_details.json") + if os.path.exists(pdf_csv): + os.remove(pdf_csv) + logger.debug(f"Removed file: {pdf_csv}") + if os.path.exists(json_path): + os.remove(json_path) + logger.debug(f"Removed file: {json_path}") + return combined_csv + + +def pull_all_agency_metadata(run_dir, overwrite=False, remove_files=True, verbose=False): + """ + Main function to pull all agency metadata and document listings from Michigan API. + + This function orchestrates the complete metadata retrieval process: + 1. Fetches all agency information + 2. Saves agency info to JSON and CSV + 3. For each agency, fetches document metadata + 4. 
Merges all document metadata into a combined CSV + + Args: + run_dir (str): Directory to save all metadata files + overwrite (bool): If False, skip agencies with existing CSV files (default: False) + remove_files (bool): Remove individual agency files after merging (default: True) + verbose (bool): Enable verbose output (default: False) + + Returns: + str: Path to the combined CSV file containing all document metadata + + Raises: + RuntimeError: If unable to fetch agency information from API + """ + os.makedirs(run_dir, exist_ok=True) + + # Step 1: Fetch all agency information + all_agency_info = get_all_agency_info() + if not all_agency_info: + raise RuntimeError("Failed to fetch agency information from API") + + if verbose: + logger.debug(json.dumps(all_agency_info, indent=2)) + + date_str = datetime.now().strftime("%Y-%m-%d") + + # Save complete agency info JSON + agency_file = os.path.join(run_dir, f"{date_str}_all_agency_info.json") + with open(agency_file, "w", encoding="utf-8") as f: + json.dump(all_agency_info, f, indent=2, ensure_ascii=False) + logger.info(f"All agency information saved to {agency_file}") + + # Extract agency list + agency_list = ( + all_agency_info.get('returnValue', {}) + .get('objectData', {}) + .get('responseResult', []) + ) + + # Step 2: Save agency info as CSV + agency_keep_cols = [ + "Address", "agencyId", "AgencyName", "AgencyType", "City", "County", + "LicenseEffectiveDate", "LicenseeGroupOrganizationName", + "LicenseExpirationDate", "LicenseNumber", "LicenseStatus", "Phone", "ZipCode" + ] + + agency_csv_file = os.path.join(run_dir, f"{date_str}_agency_info.csv") + with open(agency_csv_file, mode='w', newline='', encoding='utf-8') as f: + writer = csv.DictWriter(f, fieldnames=agency_keep_cols, quoting=csv.QUOTE_ALL) + writer.writeheader() + for agency in agency_list: + row = {col: agency.get(col, "") for col in agency_keep_cols} + writer.writerow(row) + logger.info(f"Agency info written to {agency_csv_file}") + + # Step 3: Fetch document metadata for each agency + doc_keep_cols = ['FileExtension', 'CreatedDate', 'Title', 'ContentBodyId', 'Id', 'ContentDocumentId'] + + for agency in agency_list: + record_id = agency.get('agencyId') + if not record_id: + if verbose: + logger.debug(f"Skipping agency with empty ID") + continue + + csv_file = os.path.join(run_dir, f"{record_id}_pdf_content_details.csv") + + # Skip if file exists and not overwriting + if not overwrite and os.path.exists(csv_file): + if verbose: + logger.debug(f"Skipping {record_id} (file exists, overwrite=False)") + continue + + if verbose: + logger.info(f"Processing agency ID: {record_id}") + + pdf_results = get_content_details_method(record_id) + + if pdf_results: + # Save full JSON + json_file = os.path.join(run_dir, f"{record_id}_pdf_content_details.json") + with open(json_file, "w", encoding="utf-8") as jf: + json.dump(pdf_results, jf, indent=2, ensure_ascii=False) + + # Save CSV with key fields + with open(csv_file, mode='w', newline='', encoding='utf-8') as f: + writer = csv.writer(f, quoting=csv.QUOTE_ALL) + writer.writerow(['agency_id'] + doc_keep_cols) + for p in pdf_results.get('returnValue', {}).get('contentVersionRes', []): + row_data = [record_id] + [p.get(k, "") for k in doc_keep_cols] + writer.writerow(row_data) + + if verbose: + logger.debug(f" Saved document metadata for {record_id}") + else: + logger.warning(f"Failed to retrieve document details for agency ID: {record_id}") + + # Step 4: Merge all document metadata into single CSV + combined_csv = 
merge_agency_info(agency_csv_file, run_dir, remove_files=remove_files) + return combined_csv + + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + description="Pull agency metadata and document listings from Michigan Child Welfare API" + ) + parser.add_argument( + "--run-dir", + dest="run_dir", + help="Directory for this run's metadata and artifacts", + default="./" + ) + parser.add_argument( + "--overwrite", + dest="overwrite", + action="store_true", + help="Overwrite existing files (default: False)" + ) + parser.add_argument( + "--remove-files", + dest="remove_files", + action="store_true", + default=True, + help="Remove individual agency files after merging (default: True)" + ) + parser.add_argument( + "--verbose", + action="store_true", + help="Enable verbose output" + ) + + args = parser.parse_args() + + # Configure logging + logging.basicConfig( + level=logging.DEBUG if args.verbose else logging.INFO, + format='%(asctime)s - %(name)s - %(levelname)s - %(message)s' + ) + + try: + combined_csv = pull_all_agency_metadata( + run_dir=args.run_dir, + overwrite=args.overwrite, + remove_files=args.remove_files, + verbose=args.verbose + ) + logger.info(f"Success! Combined metadata saved to: {combined_csv}") + except Exception as e: + logger.error(f"Error: {e}") + import traceback + traceback.print_exc() + exit(1) \ No newline at end of file diff --git a/mcyj_download.py b/mcyj_download.py deleted file mode 100644 index e02de94..0000000 --- a/mcyj_download.py +++ /dev/null @@ -1,80 +0,0 @@ -from datetime import datetime -import os -import re -import argparse -import csv - -def file_info_to_filename(agency_id, document_name, document_date): - # Convert file information dictionary to a filename string - - document_agency = agency_id.strip().replace(" ", "_").replace("/", "_") - document_name = document_name.strip().replace(" ", "_").replace("/", "-") - - return f"{document_agency}_{document_name}_{document_date}.pdf" - -def get_output_dir_info(output_dir): - """ - Get the list of files in the output directory and the latest date from filenames. - - Args: - output_dir (str): Directory to check for existing files.""" - - existing_files = os.listdir(output_dir) - if existing_files: - pdf_files = [f for f in existing_files if re.match(r'.*\d{4}-\d{2}-\d{2}\.pdf$', f)] - # Extract date from filename using regex and find the most recent date - all_dates = [] - for f in pdf_files: - match = re.search(r'(\d{4}-\d{2}-\d{2})\.pdf$', f) - if match: - all_dates.append(match.group(1)) - # Get the latest date from the list, parsing as YYYY-MM-DD date - all_dates = [datetime.strptime(date, '%Y-%m-%d') for date in all_dates] - if all_dates: - latest_date = max(all_dates).strftime('%Y-%m-%d') - print(f"Latest date found in existing files: {latest_date}") - else: - latest_date = None - - return existing_files, latest_date - - -if __name__ == "__main__": - parser = argparse.ArgumentParser(description="Download Child Welfare Licensing agency PDFs from Michigan's public licensing search.") - parser.add_argument("--output-dir", dest="output_dir", help="Directory to save the CSV and JSON files", default="./") - parser.add_argument("--input-file", dest="input_file", help="Path to the input CSV file") - args = parser.parse_args() - output_dir = args.output_dir - input_file = args.input_file - - # Read all of the files in the output directory and get the latest date - if not os.path.exists(output_dir): - print(f"Output directory {output_dir} does not exist. 
Creating it.") - os.makedirs(output_dir) - existing_files, latest_date = get_output_dir_info(output_dir) - - # Read in the input file csv as dictionary - if input_file: - if os.path.exists(input_file): - with open(input_file, 'r') as f: - reader = csv.DictReader(f) - input_data = [row for row in reader] - else: - raise FileNotFoundError(f"Input file {input_file} does not exist.") - - for row in input_data: - # Parse document_date in "2023-08-22T15:30:32.000Z" format to "YYYY-MM-DD" - raw_date = row.get('CreatedDate', '') - try: - parsed_date = datetime.strptime(raw_date, "%Y-%m-%dT%H:%M:%S.%fZ").strftime("%Y-%m-%d") - except (ValueError, TypeError): - parsed_date = raw_date # fallback if parsing fails - - print(file_info_to_filename( - agency_id=row.get('agency_id', ''), - document_name=row.get('Title', ''), - document_date=parsed_date - )) -# print(input_data) - print(existing_files[:5]) - print(latest_date) \ No newline at end of file diff --git a/parse_available_files.py b/parse_available_files.py deleted file mode 100644 index 4815f74..0000000 --- a/parse_available_files.py +++ /dev/null @@ -1,121 +0,0 @@ -# Selenium is used for web automation and scraping -from selenium import webdriver -from selenium.webdriver.chrome.service import Service -from selenium.webdriver.common.by import By -from selenium.webdriver.support.ui import WebDriverWait -from selenium.webdriver.support import expected_conditions as EC -from selenium.webdriver.common.keys import Keys -from selenium.common.exceptions import ElementClickInterceptedException -# Time utilities for delays and timestamp handling -import argparse -import time -from time import sleep -# Date and time parsing -from datetime import datetime -# File and path operations -from pathlib import Path -import os -import glob - -import csv -import re - -## First get the agency URLs (and ids) -#def get_agency_information(): -def get_agency_information(driver): - """ - Extracts agency information from a web page using Selenium. - - Args: - driver (webdriver): The Selenium WebDriver instance. - - Returns: - list: A list of dictionaries containing agency information. 
- """ - driver.get("https://michildwelfarepubliclicensingsearch.michigan.gov/licagencysrch/") - time.sleep(5) # Wait for the page to fully load (longer than usual due to dynamic content) - # Prepare an empty list to store all agency-specific URLs found across pages - - sub_urls = [] - table_header = [] - table_data = [] - header_elements = driver.find_elements(By.XPATH, "//lightning-datatable//table/thead/tr/th") - # Clean up header text by removing "Sort by:" and "Sorted: None" - for i, header in enumerate(header_elements): - text = header.text.strip() - text = text.replace("Sort by:", "").replace("Sorted: None", "").replace("\n", " ").strip() - table_header.append(text) - - while True: - # Find the license number rows: - # Change this to tr OR th - table_rows = driver.find_elements(By.XPATH, "//lightning-datatable//table/tbody/tr") - for row in table_rows: - # Find all columns (td elements) in the row - row_data = [] - columns = row.find_elements(By.XPATH, "./td | ./th") - for col in columns: - row_data.append(col.text.strip()) - table_data.append(row_data) - - for row in table_rows: - link_elements = row.find_elements(By.XPATH, ".//lightning-formatted-url/a") - for link in link_elements: - href = link.get_attribute('href') - if href: - sub_urls.append(href) - - try: - # Try to locate and click the "Next" page button to load the next page of results - next_button = driver.find_element(By.XPATH, "//lightning-button-icon[3]/button/lightning-primitive-icon") - next_button.click() - except ElementClickInterceptedException: - # If the click fails (e.g., no more pages or overlay blocking it), stop the loop - print("No more pages available.") - break - return sub_urls, table_header, table_data - -def write_agency_information_to_csv(sub_urls, table_header, table_data, output_dir): - """ - Writes the agency information to a CSV file. - - Args: - sub_urls (list): List of agency URLs. - table_header (list): List of table headers. - table_data (list): List of table data rows. - output_dir (str): Directory where the CSV file will be saved. 
- """ - # Ensure output directory exists - os.makedirs(output_dir, exist_ok=True) - - # Define the output CSV file path - # Add date to the filename to avoid overwriting - date_str = datetime.now().strftime("%Y%m%d") - csv_filename = f"agency_information_{date_str}.csv" - csv_file_path = os.path.join(output_dir, csv_filename) - - # Write to CSV file - with open(csv_file_path, 'w', newline='', encoding='utf-8') as csvfile: - writer = csv.writer(csvfile, quoting=csv.QUOTE_ALL) - writer.writerow(table_header) # Write header - writer.writerows(table_data) # Write data rows - - print(f"Agency information written to {csv_file_path}") - - -if __name__ == "__main__": - parser = argparse.ArgumentParser(description="Download Child Welfare Licensing agency PDFs from Michigan's public licensing search.") - parser.add_argument("--driver-path", dest="driver_path", help="Path to the ChromeDriver executable", default=None) - parser.add_argument("--output-dir", dest="output_dir", help="Directory to save the CSV file", default="./") - - args = parser.parse_args() - - # Initialize the WebDriver - service = Service(args.driver_path) - driver = webdriver.Chrome(service=service) - - try: - sub_urls, table_header, table_data = get_agency_information(driver) - write_agency_information_to_csv(sub_urls, table_header, table_data, args.output_dir) - finally: - driver.quit() # Ensure the driver is closed after use \ No newline at end of file diff --git a/pull_agency_info_api.py b/pull_agency_info_api.py deleted file mode 100644 index 5764ff9..0000000 --- a/pull_agency_info_api.py +++ /dev/null @@ -1,282 +0,0 @@ -import csv -import requests -import json -import urllib.parse -import urllib3 -import argparse -import os -from datetime import datetime - -def get_all_agency_info(): - urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning) - - base_url = "https://michildwelfarepubliclicensingsearch.michigan.gov/licagencysrch/webruntime/api/apex/execute" - - params = { - "cacheable": "true", - "classname": "@udd/01p8z0000009E4V", - "isContinuation": "false", - "method": "getAgenciesDetail", - "namespace": "", - "params": json.dumps({"recordId": None}), - "language": "en-US", - "asGuest": "true", - "htmlEncode": "false" - } - - headers = { - 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36', - 'Accept': 'application/json, text/plain, */*', - 'Accept-Language': 'en-US,en;q=0.9', - 'Referer': 'https://michildwelfarepubliclicensingsearch.michigan.gov/licagencysrch/' - } - - try: - print("GET request with recordId=null") - response = requests.get(base_url, params=params, headers=headers, verify=False, timeout=30) - response.raise_for_status() - return response.json() - except Exception as e: - print(f"GET request with recordId=null failed: {e}") - return None - -def get_agency_details(record_id): - """ - GET request with URL parameters directly to the API endpoint - """ - urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning) - - base_url = "https://michildwelfarepubliclicensingsearch.michigan.gov/licagencysrch/webruntime/api/apex/execute" - - # Build the exact URL from your example - params = { - "cacheable": "true", - "classname": "@udd/01p8z0000009E4V", - "isContinuation": "false", - "method": "getAgenciesDetail", - "namespace": "", - "params": json.dumps({"recordId": record_id}), - "language": "en-US", - "asGuest": "true", - "htmlEncode": "false" - } - - headers = { - 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36', - 'Accept': 'application/json, 
text/plain, */*', - 'Accept-Language': 'en-US,en;q=0.9', - 'Referer': 'https://michildwelfarepubliclicensingsearch.michigan.gov/licagencysrch/' - } - - try: - print("GET request with URL parameters") - response = requests.get(base_url, params=params, headers=headers, verify=False, timeout=30) - response.raise_for_status() - return response.json() - except Exception as e: - print(f"GET request with URL parameters failed: {e}") - return None - -# Fetch the list of documents (PDF metadata) for an agency -def get_content_details_method(record_id): - """ - POST with JSON payload directly to the API endpoint - """ - urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning) - - base_url = "https://michildwelfarepubliclicensingsearch.michigan.gov/licagencysrch/webruntime/api/apex/execute" - - # JSON payload - payload = { - "namespace": "", - "classname": "@udd/01p8z0000009E4V", - "method": "getContentDetails", - "isContinuation": False, - "params": { - "recordId": record_id - }, - "cacheable": False - } - - headers = { - 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36', - 'Accept': 'application/json, text/plain, */*', - 'Content-Type': 'application/json', - 'X-Requested-With': 'XMLHttpRequest', - 'Origin': 'https://michildwelfarepubliclicensingsearch.michigan.gov', - 'Referer': 'https://michildwelfarepubliclicensingsearch.michigan.gov/licagencysrch/' - } - - try: - print("POST with JSON payload directly to the API endpoint") - print(f"Payload: {json.dumps(payload, indent=2)}") - - response = requests.post( - base_url, - json=payload, - headers=headers, - verify=False, - timeout=30 - ) - response.raise_for_status() - return response.json() - except Exception as e: - print(f"POST with JSON payload directly to the API endpoint failed: {e}") - if 'response' in locals(): - print(f"Response content: {response.text}") - return None - -def merge_agency_info(agency_csv, output_dir=".", remove_files=False): - """ - Merge the per-agency PDF content details CSVs into a single combined CSV, prefixing each row with the agency name.
- """ - date_str = datetime.now().strftime("%Y-%m-%d") - - # Build a mapping from agencyId to AgencyName - agency_names = {} - with open(agency_csv, mode='r', encoding='utf-8') as f: - reader = csv.DictReader(f) - for row in reader: - agency_id = row.get('agencyId') - agency_name = row.get('AgencyName') - if agency_id and agency_name: - agency_names[agency_id] = agency_name - - # Merge PDF content details for each agency - - combined_rows = [] - header = [] - for agency_id, agency_name in agency_names.items(): - pdf_csv = os.path.join(output_dir, f"{agency_id}_pdf_content_details.csv") - if os.path.exists(pdf_csv): - with open(pdf_csv, mode='r', encoding='utf-8') as f: - reader = csv.reader(f) - header = next(reader) - header = ['agency_name'] + header - for row in reader: - combined_rows.append([agency_name] + row) - else: - print(f"Warning: PDF content details CSV not found for agency ID {agency_id}, skipping...") - continue - - # Write out the combined CSV - combined_csv = os.path.join(output_dir, f"{date_str}_combined_pdf_content_details.csv") - with open(combined_csv, mode='w', newline='', encoding='utf-8') as f: - writer = csv.writer(f, quoting=csv.QUOTE_ALL) - # Write header: agency_id + original header - writer.writerow(header) - writer.writerows(combined_rows) - - print(f"Combined PDF content details written to {combined_csv}") - # If remove files then remove each file - if remove_files: - for agency_id, agency_name in agency_names.items(): - pdf_csv = os.path.join(output_dir, f"{agency_id}_pdf_content_details.csv") - json_path = os.path.join(output_dir, f"{agency_id}_pdf_content_details.json") - if os.path.exists(pdf_csv): - os.remove(pdf_csv) - print(f"Removed file: {pdf_csv}") - if os.path.exists(json_path): - os.remove(json_path) - print(f"Removed file: {json_path}") - return combined_csv - -# Test the functions -if __name__ == "__main__": - parser = argparse.ArgumentParser(description="Download Child Welfare Licensing agency PDFs from Michigan's public licensing search.") - parser.add_argument("--output-dir", dest="output_dir", help="Directory to save the CSV and JSON files", default="./") - parser.add_argument("--overwrite", dest="overwrite", help="Overwrite existing files", default=True) - parser.add_argument("--remove-files", dest="remove_files", help="Remove individual agency files after merging", default=True) - parser.add_argument("--verbose", dest="verbose", help="Enable verbose output", default=False, action='store_true') - args = parser.parse_args() - output_dir = args.output_dir - - # # Patch all print statements in functions - # builtins.print = lambda *a, **kw: log_print(' '.join(str(x) for x in a), logging.INFO) - - os.makedirs(output_dir, exist_ok=True) - - all_agency_info = get_all_agency_info() - print(json.dumps(all_agency_info, indent=2)) - date_str = datetime.now().strftime("%Y-%m-%d") - agency_file = os.path.join(output_dir, f"{date_str}_all_agency_info.json") - - with open(agency_file, "w", encoding="utf-8") as f: - json.dump(all_agency_info, f, indent=2, ensure_ascii=False) - - print("All agency information saved to all_agency_info.json") - - # Extract the list from all_agency_info - agency_list = ( - all_agency_info.get('returnValue', {}) - .get('objectData', {}) - .get('responseResult', []) - ) - - # Define the columns to keep - keep_cols = [ - "Address", - "agencyId", - "AgencyName", - "AgencyType", - "City", - "County", - "LicenseEffectiveDate", - "LicenseeGroupOrganizationName", - "LicenseExpirationDate", - "LicenseNumber", - "LicenseStatus", - 
"Phone", - "ZipCode" - ] - - # Update CSV filename to include date - agency_csv_file = os.path.join(output_dir, f"{date_str}_agency_info.csv") - with open(agency_csv_file, mode='w', newline='', encoding='utf-8') as f: - writer = csv.DictWriter(f, fieldnames=keep_cols, quoting=csv.QUOTE_ALL) - writer.writeheader() - for agency in agency_list: - row = {col: agency.get(col, "") for col in keep_cols} - writer.writerow(row) - print(f"Agency info written to {agency_csv_file}") - - keep_cols = ['FileExtension', 'CreatedDate', 'Title', 'ContentBodyId', 'Id', 'ContentDocumentId'] - - # Run for each agency id - for agency in agency_list: - record_id = agency.get('agencyId') - csv_file = os.path.join(output_dir, f"{record_id}_pdf_content_details.csv") - if not record_id: - print(f"Skipping agency ID {record_id} as it is empty.") - continue - if args.overwrite and os.path.exists(csv_file): - print(f"File {csv_file} already exists and overwrite is enabled, skipping agency ID {record_id}.") - continue - - print(f"Processing agency ID: {record_id}") - pdf_results = get_content_details_method(record_id) - - if pdf_results: - print(f"PDF Content Details for {record_id}:") - # print(json.dumps(pdf_results, indent=2)) - # Save full JSON response to file - json_file = os.path.join(output_dir, f"{record_id}_pdf_content_details.json") - with open(json_file, "w", encoding="utf-8") as jf: - json.dump(pdf_results, jf, indent=2, ensure_ascii=False) - print(f"Full JSON results written to {json_file}") - - # Write top-level keys/values to CSV - csv_file = os.path.join(output_dir, f"{record_id}_pdf_content_details.csv") - with open(csv_file, mode='w', newline='', encoding='utf-8') as f: - writer = csv.writer(f, quoting=csv.QUOTE_ALL) - # Write the header - writer.writerow(['agency_id'] + keep_cols) - for p in pdf_results.get('returnValue', {}).get('contentVersionRes', []): - row_data = [record_id] + [p.get(k, "") for k in keep_cols] - writer.writerow(row_data) - - print(f"Top-level JSON results written to {csv_file}") - else: - print(f"Failed to retrieve PDF content details for agency ID: {record_id}") - - merge_agency_info(agency_csv_file, output_dir, remove_files=args.remove_files) \ No newline at end of file