diff --git a/README.md b/README.md deleted file mode 100644 index da7fda4..0000000 --- a/README.md +++ /dev/null @@ -1,62 +0,0 @@ -# MCYJ Parsing Script - -## 1. Get all the available documents from the Michigan Welfare public search API - -```bash -python pull_agency_info_api.py --output-dir metadata_output --overwrite=False --verbose -``` - -This will output the agency info and correpsonding documents to the `metadata_output` directory. -The default behavior will output all available documents in both json and csv formats. - -### 1. Output -```bash -ls metadata_output -#> 2025-10-30_agency_info.csv -#> 2025-10-30_all_agency_info.json -#> 2025-10-30_combined_pdf_content_details.csv -``` - -## 2. Get a list of extra and missing files in the downloaded files - -```r -python get_download_list.py --download-folder Downloads --available-files "metadata_output/$(date +"%Y-%m-%d")_combined_pdf_content_details.csv" -``` - -### 2. Output -```bash -ls metadata_output -#> 2025-10-30_agency_info.csv -#> 2025-10-30_all_agency_info.json -#> 2025-10-30_combined_pdf_content_details.csv -#> extra_files.txt -#> missing_files.csv -``` - - - `extra_files.txt` contains files that are in `Downloads` but are not found from the API (most likely due to naming discrepancies) - - `missing_Files.csv` contains missing files in the csv format with header: - -``` -generated_filename,agency_name,agency_id,FileExtension,CreatedDate,Title,ContentBodyId,Id,ContentDocumentId -``` - -## 3. Download missing documents - -```bash -python download_all_pdfs.py --csv metadata_output/missing_files.csv --output-dir Downloads -``` - -### 3. Output - -```bash -$ ls downloads/ | head -# 42ND_CIRCUIT_COURT_-_FAMILY_DIVISION_42ND_CIRCUIT_COURT_-_FAMILY_DIVISION_Interim_2025_2025-07-18_069cs0000104BR0AAM.pdf -# ADOPTION_AND_FOSTER_CARE_SPECIALISTS,_INC._CB440295542_INSP_201_2020-03-14_0698z000005Hpu5AAC.pdf -# ADOPTION_AND_FOSTER_CARE_SPECIALISTS,_INC._CB440295542_ORIG.pdf_2008-06-24_0698z000005HozQAAS.pdf -# ADOPTION_ASSOCIATES,_INC_Adoption_Associates_INC_Renewal_2025_2025-08-20_069cs0000163byMAAQ.pdf -# ADOPTION_OPTION,_INC._CB560263403_ORIG.pdf_2004-05-08_0698z000005Hp18AAC.pdf -``` - -## 4. Check duplicates and update file metadata - -check the md5sums \ No newline at end of file diff --git a/download_all_pdfs.py b/download_all_pdfs.py deleted file mode 100644 index f02f731..0000000 --- a/download_all_pdfs.py +++ /dev/null @@ -1,118 +0,0 @@ -#!/usr/bin/env python3 -""" -Script to download all PDFs listed in a CSV by calling download_michigan_pdf -from `download_pdf.py` for each row. - -Expected CSV headers: -generated_filename,agency_name,agency_id,FileExtension,CreatedDate,Title,ContentBodyId,Id,ContentDocumentId - -Usage: -python download_all_pdfs.py --csv /path/to/file.csv --output-dir ./pdfs -""" -import csv -import os -import argparse -import time -from typing import Optional - -# Import functions from download_pdf.py -try: - from download_pdf import download_michigan_pdf -except Exception as e: - raise SystemExit(f"Failed to import download_michigan_pdf from download_pdf.py: {e}") - - -def process_csv(csv_path: str, output_dir: str, skip_existing: bool = True, limit: Optional[int] = None, sleep_seconds: float = 0.0): - """Read CSV and call download_michigan_pdf for each row. 
- - Parameters: - csv_path: path to input CSV - output_dir: directory where PDFs will be saved - skip_existing: if True and generated_filename present, skip if file exists - limit: optional max number of rows to process - """ - if not os.path.exists(csv_path): - raise FileNotFoundError(f"CSV file not found: {csv_path}") - - os.makedirs(output_dir, exist_ok=True) - - processed = 0 - failed = 0 - - with open(csv_path, newline='', encoding='utf-8') as fh: - reader = csv.DictReader(fh) - for row in reader: - if limit is not None and processed >= limit: - break - - # Extract required fields from the CSV header - gen_filename = (row.get('generated_filename') or '').strip() - agency_name = (row.get('agency_name') or '').strip() - agency_id = (row.get('agency_id') or '').strip() - file_ext = (row.get('FileExtension') or '').strip() - created_date = (row.get('CreatedDate') or '').strip() - title = (row.get('Title') or '').strip() - content_body_id = (row.get('ContentBodyId') or '').strip() - id_field = (row.get('Id') or '').strip() - content_document_id = (row.get('ContentDocumentId') or '').strip() - - # The download function needs ContentDocumentId (document_id); - # fill other args from CSV. - if not content_document_id: - print(f"Skipping row with missing ContentDocumentId: {row}") - failed += 1 - continue - - # If a generated_filename is provided, optionally skip download when file exists - if gen_filename: - target_path = os.path.join(output_dir, gen_filename) - if skip_existing and os.path.exists(target_path): - print(f"Skipping existing file: {target_path}") - processed += 1 - continue - - try: - print(f"Downloading document {content_document_id} (agency: {agency_name}, title: {title})") - out_path = download_michigan_pdf( - document_id=content_document_id, - document_agency=agency_name if agency_name else None, - document_name=title if title else None, - document_date=created_date if created_date else None, - output_dir=output_dir - ) - - if out_path: - print(f"Saved to: {out_path}") - else: - print(f"Download returned None for {content_document_id}") - failed += 1 - - except Exception as e: - print(f"Error downloading {content_document_id}: {e}") - failed += 1 - - processed += 1 - # Sleep between downloads if requested - if sleep_seconds and sleep_seconds > 0: - try: - print(f"Sleeping for {sleep_seconds} seconds...") - time.sleep(sleep_seconds) - except KeyboardInterrupt: - print("Sleep interrupted by user.") - break - - print(f"Done. Processed: {processed}. 
Failures: {failed}.") - return processed, failed - - -if __name__ == '__main__': - parser = argparse.ArgumentParser(description='Download PDFs listed in a CSV using download_michigan_pdf from download_pdf.py') - parser.add_argument('--csv', required=True, help='Path to input CSV file') - parser.add_argument('--output-dir', required=True, help='Directory to save downloaded PDFs') - parser.add_argument('--no-skip', dest='skip_existing', action='store_false', help='Do not skip when generated_filename exists') - parser.add_argument('--limit', type=int, default=None, help='Optional max number of rows to process') - parser.add_argument('--sleep', dest='sleep_seconds', type=float, default=0.0, help='Seconds to sleep between downloads (float allowed)') - - args = parser.parse_args() - - process_csv(args.csv, args.output_dir, skip_existing=args.skip_existing, limit=args.limit, sleep_seconds=args.sleep_seconds) diff --git a/download_pdf.py b/download_pdf.py deleted file mode 100644 index d59e1a0..0000000 --- a/download_pdf.py +++ /dev/null @@ -1,193 +0,0 @@ -import requests -import base64 -import urllib3 -import os -import re -import argparse - -def get_content_base_data(document_id): - """ - POST request to fetch content base data for a given ContentDocumentId. - """ - urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning) - - # Use same base endpoint as other functions; include the query params - base_url = "https://michildwelfarepubliclicensingsearch.michigan.gov/licagencysrch/webruntime/api/apex/execute?language=en-US&asGuest=true&htmlEncode=false" - - payload = { - "namespace": "", - "classname": "@udd/01p8z0000009E4V", - "method": "getContentBaseData", - "isContinuation": False, - "params": { - "contentDocumentId": document_id, - "actionName": "download" - }, - "cacheable": False - } - - headers = { - 'Accept': 'application/json, text/plain, */*', - 'Content-Type': 'application/json', - 'X-Requested-With': 'XMLHttpRequest', - 'Origin': 'https://michildwelfarepubliclicensingsearch.michigan.gov', - 'Referer': 'https://michildwelfarepubliclicensingsearch.michigan.gov/licagencysrch/' - } - - try: - print(f"POST getContentBaseData for ContentDocumentId={document_id}") - response = requests.post( - base_url, - json=payload, - headers=headers, - verify=False, - timeout=60 - ) - response.raise_for_status() - return response.json() - except Exception as e: - print(f"get_document_body failed for {document_id}: {e}") - if 'response' in locals(): - try: - print(f"Response content: {response.text}") - except Exception: - pass - return None - -# Note: I think we can do the same thing here using get_content_base_data -def download_michigan_pdf(document_id, document_agency=None, document_name=None, document_date=None, output_dir="./"): - """ - Download a PDF from Michigan Child Welfare Public Licensing Search - - Args: - document_id (str): The document ID (e.g., "0698z0000061FxYAAU") - document_agency (str, optional): Name of the agency for filename - document_name (str, optional): Name of the document for filename - output_dir (str): Directory to save the PDF (default: current directory) - - Returns: - str: Path to the downloaded file if successful, None if failed - """ - - # Disable SSL warnings - urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning) - - # Headers to mimic a real browser - headers = { - 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36', - 'Accept': 
'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8', - 'Accept-Language': 'en-US,en;q=0.5', - 'Accept-Encoding': 'gzip, deflate', - 'Connection': 'keep-alive', - 'Upgrade-Insecure-Requests': '1' - } - - try: - # Make the request - res = get_content_base_data(document_id=document_id) - base64_data = res['returnValue'] - - # Decode the PDF content - pdf_content = base64.b64decode(base64_data) - - # Generate filename - filename = generate_filename(document_id, document_agency, document_name, document_date) - - # Ensure output directory exists - os.makedirs(output_dir, exist_ok=True) - - # Full path for the file - file_path = os.path.join(output_dir, filename) - - # Save the PDF - with open(file_path, 'wb') as f: - f.write(pdf_content) - - print(f"PDF downloaded successfully: {file_path}") - print(f"File size: {len(pdf_content)} bytes") - - return file_path - - except requests.exceptions.RequestException as e: - print(f"Error making request: {e}") - return None - except Exception as e: - print(f"Error processing PDF: {e}") - return None - -def generate_filename(document_id, document_agency, document_name, document_date): - """ - Generate a filename based on the provided parameters - - Args: - document_id (str): The document ID - document_agency (str): Agency name - document_name (str): Document name - document_date (str): Document date (not used in this version) - - Returns: - str: Generated filename - """ - # Clean up strings to be filesystem-safe - def clean_string(s): - if not s: - return "" - # Remove/replace problematic characters - s = re.sub(r'[<>:"/\\|?*]', '_', s) - # Remove extra whitespace - s = re.sub(r'\s+', '_', s) - # Remove leading/trailing underscores - s = s.strip('_') - return s - - # Build filename components - parts = [] - - if document_agency: - parts.append(clean_string(document_agency)) - - if document_name: - parts.append(clean_string(document_name)) - - if document_date: - # Ensure the date is in YYYY-MM-DD format - match = re.match(r'(\d{4})[-/](\d{2})[-/](\d{2})', str(document_date)) - # Raise error if month > 12 - if match: - if int(match.group(2)) > 12: - raise ValueError("Month in document date cannot be greater than 12") - formatted_date = f"{match.group(1)}-{match.group(2)}-{match.group(3)}" - parts.append(formatted_date) - else: - # If not in correct format, skip or handle as needed - raise ValueError("Document date must be in YYYY-MM-DD format") - - # Always include the document ID - parts.append(clean_string(document_id)) - - # Join parts with underscores - filename = '_'.join(parts) - - # Ensure it ends with .pdf - if not filename.lower().endswith('.pdf'): - filename += '.pdf' - - return filename - -if __name__ == "__main__": - parser = argparse.ArgumentParser(description="Download Michigan Child Welfare PDF by document ID.") - parser.add_argument("document_id", help="The document ID (e.g., 0698z0000061FxYAAU)") - parser.add_argument("--agency", dest="document_agency", help="Agency name for filename", default=None) - parser.add_argument("--name", dest="document_name", help="Document name for filename", default=None) - parser.add_argument("--output-dir", dest="output_dir", help="Directory to save the PDF", default="./") - parser.add_argument("--date", dest="document_date", help="Document date for filename (YYYY-MM-DD)", default=None) - - args = parser.parse_args() - - download_michigan_pdf( - document_id=args.document_id, - document_agency=args.document_agency, - document_name=args.document_name, - document_date=args.document_date, - 
output_dir=args.output_dir - ) \ No newline at end of file diff --git a/get_download_list.py b/get_download_list.py deleted file mode 100644 index 04e020a..0000000 --- a/get_download_list.py +++ /dev/null @@ -1,105 +0,0 @@ -import os -import argparse -import csv -from datetime import datetime -import re - -def get_downloaded_files(download_folder, lower = True): - all_files = os.listdir(download_folder) - if lower: - return set(f.lower() for f in all_files) - else: - return set(all_files) - -if __name__ == "__main__": - parser = argparse.ArgumentParser(description="Check for missing downloaded files.") - parser.add_argument("--download-folder", help="Folder containing downloaded files") - parser.add_argument("--available-files", help="File listing expected files") - args = parser.parse_args() - - downloaded_files = get_downloaded_files(args.download_folder) - downloaded_files_no_date = {re.sub(r'_\d{4}-\d{2}-\d{2}\.pdf$', '', f) for f in downloaded_files if f.endswith('.pdf')} - expected_files = set() - expected_files_no_date = set() - expected_files_info = [] # Store complete row information - filename_to_row = {} # Map filename to complete row - - # Get the directory of the available-files for output - available_files_dir = os.path.dirname(args.available_files) - if not available_files_dir: - available_files_dir = "." # Current directory if no path specified - - # Read the expected files as csv dict - with open(args.available_files, "r") as csvfile: - reader = csv.DictReader(csvfile) - for row in reader: - #print(row) - document_agency = row.get("agency_name", "") - document_agency = document_agency.strip().replace(" ", "_").replace("/", "_") - document_name = row.get("Title", "") - document_name = document_name.strip().replace(" ", "_").replace("/", "-") - created_date = row.get("CreatedDate", "") - extension = row.get("FileExtension", "pdf") - #print(f"Processing: {document_agency}, {document_name}, {created_date}, {extension}") - # datetime.strptime(sanitized_date, '%m-%d-%Y').date() - document_date = datetime.strptime(created_date, "%Y-%m-%dT%H:%M:%S.%fZ").date() - filename_with_date = f"{document_agency}_{document_name}_{document_date}.{extension}".lower() - filename_no_date = f"{document_agency}_{document_name}".lower() - - expected_files.add(filename_with_date) - expected_files_no_date.add(filename_no_date) - expected_files_info.append(row) - filename_to_row[filename_no_date] = row - - - # If the files have the same agency and name, but the date is different, we consider it a different file - - extra_files = downloaded_files_no_date - expected_files_no_date - missing_files = expected_files_no_date - downloaded_files_no_date - -# print(list(downloaded_files)[:5]) - - # Print intersection - print("Files in both downloaded and expected:") - for f in sorted(downloaded_files & expected_files): - print(f) - - print("Files in downloaded but not expected:") - # Write download files to a file - extra_files_path = os.path.join(available_files_dir, "extra_files.txt") - with open(extra_files_path, "w") as f: - for file in sorted(extra_files): - f.write(file + "\n") - - # Write expected files to a file using CSV writer - missing_files_path = os.path.join(available_files_dir, "missing_files.csv") - with open(missing_files_path, "w", newline='') as f: - if missing_files and expected_files_info: - # Get the first row to extract headers - headers = list(expected_files_info[0].keys()) - writer = csv.writer(f) - - # Write header row with filename as additional column - 
writer.writerow(["generated_filename"] + headers) - - # Write data rows for missing files - for file in sorted(missing_files): - if file in filename_to_row: - row = filename_to_row[file] - row_values = [row.get(header, "") for header in headers] - # Add file extension to the generated filename - file_extension = row.get("FileExtension", "pdf") - filename_with_extension = f"{file}.{file_extension}" - writer.writerow([filename_with_extension] + row_values) - else: - # If no row data available, write just the filename - writer.writerow([file] + [""] * len(headers)) - else: - # Fallback to just filenames if no row data available - writer = csv.writer(f) - writer.writerow(["generated_filename"]) # Simple header - for file in sorted(missing_files): - writer.writerow([file]) - - print(len(missing_files), "missing files found.") - print(len(extra_files), "extra files found.") \ No newline at end of file diff --git a/ingestion/README.md b/ingestion/README.md new file mode 100644 index 0000000..63500bd --- /dev/null +++ b/ingestion/README.md @@ -0,0 +1,107 @@ +# MCYJ Primary Document Ingestion + +This directory contains scripts for downloading the current set of documents available from the [Michigan Welfare Licensing Search](https://michildwelfarepubliclicensingsearch.michigan.gov/licagencysrch/). The pipeline uses two separate directories: + +1. **Download directory**: Persistent storage for PDF documents (accumulates over time) +2. **Run directory**: Run-specific metadata, logs, and artifacts for each round of ingestion + +The scripts are idempotent - running them multiple times will only download new documents. + +## Quick Start + +The ingestion process consists of two simple steps: + +### Step 1: Get Agency Metadata + +Pull all available documents from the Michigan API: + +```bash +python pull_agency_info_api.py --run-dir run_2025-11-03 --overwrite=False --verbose +``` + +**Output** (in `run_2025-11-03/`): +- `YYYY-MM-DD_agency_info.csv` +- `YYYY-MM-DD_all_agency_info.json` +- `YYYY-MM-DD_combined_pdf_content_details.csv` + +### Step 2: Download Documents + +Download PDFs directly from the content details CSV: + +```bash +python download_all_pdfs.py --csv "run_2025-11-03/$(date +"%Y-%m-%d")_combined_pdf_content_details.csv" --download-dir Downloads +``` + +The download script automatically: +- Detects existing files by ContentDocumentId pattern (`*_{ContentDocumentId}.pdf`) +- Skips files that already exist (unless `--no-skip` is used) +- Verifies content integrity using SHA256 hashes +- Handles files with different agency names (agency renames, etc.) 
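+
+The existing-file check above works purely on filenames. As an illustration only (assuming the default `Downloads/` directory and reusing the example ContentDocumentId that appears in the module docs), you can reproduce the check by hand:
+
+```bash
+# If this glob matches anything, download_all_pdfs.py treats the document as already downloaded
+ls Downloads/*_0698z0000061FxYAAU.pdf 2>/dev/null || echo "not downloaded yet"
+```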
+
+**Options:**
+- `--no-skip`: Re-download existing files (default: skip existing files)
+- `--limit N`: Download at most N files (useful for testing)
+- `--sleep SECONDS`: Delay between downloads (default: 0.1)
+
+### Directory Structure
+
+After running, you'll have two separate directories:
+
+**Download directory** (`Downloads/`):
+- PDF files named `{agency_name}_{ContentDocumentId}.pdf` (see **File Naming** under Known Limitations)
+- This directory persists and accumulates PDFs across multiple runs
+
+**Run directory** (`run_2025-11-03/`):
+- `YYYY-MM-DD_agency_info.csv` - Agency information
+- `YYYY-MM-DD_all_agency_info.json` - Complete API response
+- `YYYY-MM-DD_combined_pdf_content_details.csv` - All available documents
+- This directory is specific to each run and contains all metadata/logs
+
+## Example: Test Run
+
+To test with only 10 downloads:
+
+```bash
+# Step 1: Get metadata
+python pull_agency_info_api.py --run-dir run_test --verbose
+
+# Step 2: Download with limit
+python download_all_pdfs.py --csv "run_test/$(date +"%Y-%m-%d")_combined_pdf_content_details.csv" --download-dir Downloads --limit 10
+```
+
+## Re-running the Pipeline
+
+The ingestion pipeline is designed to be run repeatedly:
+
+1. **Daily updates**: Run the two-step process daily with a new run directory to pick up new documents
+2. **Incremental downloads**: Only new/missing documents are downloaded
+3. **Metadata refresh**: Pass `--overwrite` to pull_agency_info_api.py to force a metadata refresh
+
+```bash
+# Daily cron job example - creates a new run directory each day
+# Step 1: Get metadata
+0 2 * * * cd /path/to/ingestion && python pull_agency_info_api.py --run-dir "run_$(date +\%Y-\%m-\%d)"
+# Step 2: Download files
+5 2 * * * cd /path/to/ingestion && python download_all_pdfs.py --csv "run_$(date +\%Y-\%m-\%d)/$(date +\%Y-\%m-\%d)_combined_pdf_content_details.csv" --download-dir Downloads
+```
+
+### Directory Best Practices
+
+- **Download directory**: Single persistent directory (e.g., `Downloads/`) that accumulates all PDFs
+- **Run directories**: Create a new one for each execution (e.g., `run_2025-11-03`, `run_2025-11-04`)
+- Keep run directories for audit trails and historical metadata
+
+## Known Limitations
+
+The Michigan API has an important limitation regarding document versioning:
+
+- **The API only exposes the latest version of each document**
+- Historical versions of documents are NOT accessible through the API
+- Each document has a unique `ContentDocumentId` (069...) used for downloading
+- Documents also have a `ContentVersionId` (068...) that changes when updated
+- However, the API does NOT support downloading by `ContentVersionId`
+- When a document is updated, the old version becomes inaccessible
+
+In practice, however, we believe Michigan *never* publishes multiple versions of a document on this platform. We sometimes see the opposite: several identical files exposed under different `ContentDocumentId` values (a quick way to spot these is sketched below). For now we operate as if `ContentDocumentId` is sufficient; if we ever find the PDF behind a `ContentDocumentId` changing, we will adjust accordingly.
+
+**File Naming**: Files are named `{agency_name}_{ContentDocumentId}.pdf`, where `ContentDocumentId` uniquely identifies each document (but only its latest version is downloadable).
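+
+Because identical content can appear under several `ContentDocumentId`s, a quick duplicate check over the download directory can be useful. This is only a sketch; it assumes GNU coreutils (`md5sum`, `uniq --all-repeated`) and the default `Downloads/` directory:
+
+```bash
+# Print groups of byte-identical PDFs (same MD5 hash, different filenames), separated by blank lines
+md5sum Downloads/*.pdf | sort | uniq -w32 --all-repeated=separate
+```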
\ No newline at end of file diff --git a/ingestion/download_all_pdfs.py b/ingestion/download_all_pdfs.py new file mode 100644 index 0000000..86bfbe0 --- /dev/null +++ b/ingestion/download_all_pdfs.py @@ -0,0 +1,157 @@ +#!/usr/bin/env python3 +""" +Batch PDF Downloader - Downloads PDFs from Content Details CSV + +This script downloads PDFs listed in a CSV file (such as combined_pdf_content_details.csv) +by calling download_michigan_pdf from download_pdf.py for each row. It handles batch +downloads with progress tracking, error handling, and rate limiting. + +The script performs these operations: +1. Reads a CSV file containing document metadata +2. For each document, checks if it already exists (by ContentDocumentId pattern) +3. Downloads missing PDFs from the Michigan API +4. Skips files that already exist (configurable with --no-skip) +5. Applies rate limiting between downloads +6. Reports progress and failures + +Expected CSV headers: + ContentDocumentId,agency_name (required) + Other columns are ignored but can be present + +File Naming Convention: + Files are named using ContentDocumentId as the unique identifier. + Format: {agency_name}_{ContentDocumentId}.pdf + Example: glens_house_0698z0000061FxYAAU.pdf + + Note: The API only exposes the latest version of each document. Historical versions + are not accessible. + +Usage: + python download_all_pdfs.py --csv combined_pdf_content_details.csv --download-dir Downloads [--limit 100] [--sleep 0.5] + +Options: + --csv: Path to CSV file with document metadata (e.g., combined_pdf_content_details.csv) + --download-dir: Directory to save downloaded PDFs + --no-skip: Re-download files even if they exist + --limit: Maximum number of files to download (for testing) + --sleep: Seconds to sleep between downloads (default: 0.1) + +Author: STATCOM MCYJ project +""" +import csv +import os +import argparse +import time +import logging +from typing import Optional + +# Set up logger +logger = logging.getLogger(__name__) + +# Import functions from download_pdf.py +from download_pdf import download_michigan_pdf, generate_filename +import glob + +def process_csv(csv_path: str, output_dir: str, skip_existing: bool = True, limit: Optional[int] = None, sleep_seconds: Optional[float] = 0.1): + """Read CSV and call download_michigan_pdf for each row. + + Parameters: + csv_path: path to input CSV + output_dir: directory where PDFs will be saved + skip_existing: if True and file with matching ContentDocumentId + is present, skip if file exists. If False, download and verify + that sha256 matches existing file (we will throw an exception and + abort entire run if sha256 does not match, as this would indicates data + inconsistency that requires investigation). + limit: optional max number of rows to process + sleep_seconds: seconds to sleep between downloads to respect + rate limiting and avoid server overload. 
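+
+    Returns (inferred from the function body):
+        tuple: (processed, failed) row counts; 'failed' includes rows with a
+        missing ContentDocumentId as well as rows whose download raised an error.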
+ """ + + if not os.path.exists(csv_path): + raise FileNotFoundError(f"CSV file not found: {csv_path}") + + os.makedirs(output_dir, exist_ok=True) + + processed = 0 + failed = 0 + + with open(csv_path, newline='', encoding='utf-8') as fh: + reader = csv.DictReader(fh) + for row in reader: + if limit is not None and processed >= limit: + break + + # Extract required fields from the CSV header + content_document_id = (row.get('ContentDocumentId') or '').strip() + agency_name = (row.get('agency_name') or '').strip() + + # Validate required fields + if not content_document_id: + logger.warning(f"Skipping row with missing ContentDocumentId: {row}") + failed += 1 + continue + + # Use glob to find existing files with pattern *_ContentDocumentId.pdf + pattern = os.path.join(output_dir, f"*_{content_document_id}.pdf") + existing_files = glob.glob(pattern) + + # if there is exactly one existing file + # we consider that the filename to use + # (and possibly skip download) + if len(existing_files)==1: + filename = os.path.basename(existing_files[0]) + if skip_existing: + logger.info(f"Skipping existing file: {filename}") + processed += 1 + continue + elif len(existing_files) > 1: + raise ValueError(f"Multiple existing files found for ContentDocumentId={content_document_id}: {existing_files}") + else: + filename = generate_filename(content_document_id, agency_name) + + try: + logger.info(f"Downloading document {content_document_id} (agency: {agency_name})") + file_path = os.path.join(output_dir, filename) + out_path = download_michigan_pdf( + document_id=content_document_id, + file_path=file_path + ) + logger.info(f"Saved to: {out_path}") + + except Exception as e: + logger.error(f"Error downloading {content_document_id}: {e}") + failed += 1 + + processed += 1 + # Sleep between downloads if requested + if sleep_seconds and sleep_seconds > 0: + try: + logger.debug(f"Sleeping for {sleep_seconds} seconds...") + time.sleep(sleep_seconds) + except KeyboardInterrupt: + logger.info("Sleep interrupted by user.") + break + + logger.info(f"Done. Processed: {processed}. 
Failures: {failed}.") + return processed, failed + + +if __name__ == '__main__': + # Set up logging for script use + logging.basicConfig( + level=logging.INFO, + format='%(asctime)s - %(name)s - %(levelname)s - %(message)s', + datefmt='%Y-%m-%d %H:%M:%S' + ) + + parser = argparse.ArgumentParser(description='Download PDFs from content details CSV using download_michigan_pdf') + parser.add_argument('--csv', required=True, help='Path to content details CSV file (e.g., combined_pdf_content_details.csv)') + parser.add_argument('--download-dir', required=True, help='Directory to save downloaded PDFs') + parser.add_argument('--no-skip', dest='skip_existing', action='store_false', help='Do not skip when generated_filename exists') + parser.add_argument('--limit', type=int, default=None, help='Optional max number of rows to process') + parser.add_argument('--sleep', dest='sleep_seconds', type=float, default=0.1, help='Seconds to sleep between downloads (float allowed)') + + args = parser.parse_args() + + process_csv(args.csv, args.download_dir, skip_existing=args.skip_existing, limit=args.limit, sleep_seconds=args.sleep_seconds) diff --git a/ingestion/download_pdf.py b/ingestion/download_pdf.py new file mode 100644 index 0000000..5eb5eb9 --- /dev/null +++ b/ingestion/download_pdf.py @@ -0,0 +1,263 @@ +""" +Single PDF Downloader - Core Download Function + +This script provides the core functionality for downloading a single PDF document +from the Michigan Child Welfare Public Licensing Search system. It can be used +as a standalone script or imported by other scripts (like download_all_pdfs.py). + +The script performs these operations: +1. Fetches document content from the Michigan API using ContentDocumentId +2. Decodes the base64-encoded PDF data +3. Generates a standardized filename from agency, document name, and date +4. Saves the PDF to the specified download directory + +Usage as standalone: + python download_pdf.py --csv run_2025-11-03/combined_pdf_content_details.csv --download-dir Downloads + +Usage as module: + from download_pdf import download_michigan_pdf + + # Download with explicit file path + download_michigan_pdf(document_id="0698z0000061FxYAAU", file_path="Downloads/my_document.pdf") + +Arguments (standalone): + document_id: The ContentDocumentId from the API (required) + --csv: CSV file to lookup agency name (required) + --download-dir: Directory to save the PDF (default: current directory) + +Note: If a file already exists, the script will download the content and compare SHA256 +hashes. If they match, it succeeds without overwriting. If they differ, it raises an error. + +Author: STATCOM MCYJ project +""" + +import requests +import base64 +import urllib3 +import os +import re +import argparse +import logging +import hashlib +from io import BytesIO + +# Set up logger +logger = logging.getLogger(__name__) + +def get_content_base_data(document_id): + """ + POST request to fetch content base data for a given ContentDocumentId. + + Args: + document_id (str): ContentDocumentId (069...) + + Returns: + dict: JSON response with base64-encoded PDF data + None: If request fails + + Note: + The API only supports downloading by ContentDocumentId, which always returns + the latest version. ContentVersionId cannot be used for downloading. 
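+
+        The decoded PDF bytes are carried in the 'returnValue' key as a base64
+        string, which is how download_michigan_pdf() consumes this response.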
+ """ + urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning) + + # Use same base endpoint as other functions; include the query params + base_url = "https://michildwelfarepubliclicensingsearch.michigan.gov/licagencysrch/webruntime/api/apex/execute?language=en-US&asGuest=true&htmlEncode=false" + + params_dict = { + "contentDocumentId": document_id, + "actionName": "download" + } + + payload = { + "namespace": "", + "classname": "@udd/01p8z0000009E4V", + "method": "getContentBaseData", + "isContinuation": False, + "params": params_dict, + "cacheable": False + } + + headers = { + 'Accept': 'application/json, text/plain, */*', + 'Content-Type': 'application/json', + 'X-Requested-With': 'XMLHttpRequest', + 'Origin': 'https://michildwelfarepubliclicensingsearch.michigan.gov', + 'Referer': 'https://michildwelfarepubliclicensingsearch.michigan.gov/licagencysrch/' + } + + logger.info(f"POST getContentBaseData for ContentDocumentId={document_id}") + response = requests.post( + base_url, + json=payload, + headers=headers, + verify=False, + timeout=60 + ) + response.raise_for_status() + return response.json() + +def download_michigan_pdf(document_id, file_path): + """ + Download a PDF from Michigan Child Welfare Public Licensing Search + + Args: + document_id (str): The ContentDocumentId (e.g., "0698z0000061FxYAAU") + file_path (str): Where to download the PDF file + + Returns: + str: Path to the downloaded file if successful, None if failed + + Raises: + ValueError: If API request fails or if existing file has different content (SHA256 mismatch) + + Note: + Downloads using ContentDocumentId, which always returns the latest version. + The API does not support downloading historical versions. + If file exists, compares SHA256 hash - succeeds if identical, raises ValueError if different. + """ + + # Disable SSL warnings + urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning) + + logger.info(f"Downloading document: {document_id}") + res = get_content_base_data(document_id=document_id) + + if not res or 'returnValue' not in res: + raise ValueError(f"Failed to get PDF data from API for document ID: {document_id}") + + base64_data = res['returnValue'] + + # Decode the PDF content + pdf_content = base64.b64decode(base64_data) + + # Calculate SHA256 of the new content + new_content_hash = hashlib.sha256(pdf_content).hexdigest() + + # Check if file exists and compare hashes + if os.path.exists(file_path): + logger.info(f"File already exists: {file_path}") + logger.info("Comparing SHA256 hashes...") + + # Read existing file and calculate its hash + with open(file_path, 'rb') as f: + existing_content = f.read() + existing_content_hash = hashlib.sha256(existing_content).hexdigest() + + if new_content_hash == existing_content_hash: + logger.info(f"SHA256 hashes match - file is identical") + logger.info(f"Downloaded {len(pdf_content)} bytes and verified existing file") + return file_path + else: + raise ValueError( + f"File exists but content differs (SHA256 mismatch):\n" + f" Existing: {existing_content_hash}\n" + f" New: {new_content_hash}\n" + f" File: {file_path}" + ) + + # Save the PDF (only if file doesn't exist or hashes match) + with open(file_path, 'wb') as f: + f.write(pdf_content) + + logger.info(f"Downloaded {len(pdf_content)} bytes") + + return file_path + +def generate_filename(document_id, document_agency): + """ + Generate a filename based on ContentDocumentId and agency name. 
+ + Args: + document_id (str): The ContentDocumentId (e.g., "0698z0000061FxYAAU") + document_agency (str): Agency name + + Returns: + str: Generated filename in format: {agency_name}_{document_id}.pdf + + Example: + generate_filename("0698z0000061FxYAAU", "Glen's House") + -> "glens_house_0698z0000061FxYAAU.pdf" + """ + # Clean up agency name to be filesystem-safe + def clean_string(s): + if not s: + return "" + # Remove/replace problematic characters + s = re.sub(r'[<>:"/\\|?*]', '_', s) + # Replace spaces and special chars with underscores + s = re.sub(r'\s+', '_', s) + # Remove apostrophes and quotes + s = s.replace("'", "").replace('"', '') + # Remove leading/trailing underscores + s = s.strip('_') + return s.lower() + + # Build filename: {agency}_{document_id}.pdf + agency_clean = clean_string(document_agency) if document_agency else "unknown" + + # If no document_id provided, use placeholder + if not document_id: + import time + document_id = f"unknown_{int(time.time())}" + + # Format: {agency_name}_{document_id}.pdf + filename = f"{agency_clean}_{document_id}.pdf" + + return filename + +if __name__ == "__main__": + # Set up logging for script use + logging.basicConfig( + level=logging.INFO, + format='%(asctime)s - %(name)s - %(levelname)s - %(message)s', + datefmt='%Y-%m-%d %H:%M:%S' + ) + + parser = argparse.ArgumentParser( + description="Download Michigan Child Welfare PDF by document ID." + ) + parser.add_argument("document_id", help="The document ID (ContentDocumentId, e.g., 0698z0000061FxYAAU)") + parser.add_argument("--csv", required=True, help="CSV file to lookup agency name (e.g., combined_pdf_content_details.csv)") + parser.add_argument("--download-dir", dest="output_dir", help="Directory to save the PDF", default="./") + + args = parser.parse_args() + + # Look up agency name from CSV + import csv as csv_module + agency_name = None + with open(args.csv, 'r', encoding='utf-8') as f: + reader = csv_module.DictReader(f) + for row in reader: + if row.get('ContentDocumentId', '').strip() == args.document_id: + agency_name = row.get('agency_name', '').strip() + break + + if not agency_name: + logger.error(f"Could not find ContentDocumentId={args.document_id} in {args.csv}") + logger.error("Make sure the CSV has 'ContentDocumentId' and 'agency_name' columns") + exit(1) + + logger.info(f"Found agency in CSV: {agency_name}") + + # Ensure output directory exists + os.makedirs(args.output_dir, exist_ok=True) + + # Generate filename and full file path + filename = generate_filename(args.document_id, agency_name) + file_path = os.path.join(args.output_dir, filename) + + try: + result = download_michigan_pdf( + document_id=args.document_id, + file_path=file_path + ) + except Exception as e: + logger.error(f"Download failed: {e}") + exit(1) + + if result: + logger.info(f"Success! File saved to: {result}") + else: + logger.error("Download failed") + exit(1) \ No newline at end of file diff --git a/ingestion/pull_agency_info_api.py b/ingestion/pull_agency_info_api.py new file mode 100644 index 0000000..8d3f082 --- /dev/null +++ b/ingestion/pull_agency_info_api.py @@ -0,0 +1,384 @@ +""" +Michigan Child Welfare Agency Metadata Retrieval + +This script pulls agency information and associated document metadata from the +Michigan Child Welfare Public Licensing Search API. It fetches data for all +agencies and their available documents, saving the results to a run directory. + +The script performs these operations: +1. Fetches all agency information from the API +2. 
For each agency, retrieves associated document metadata +3. Saves all agency info and document details as JSON, + and most of the key information is also stored as CSV +4. Merges all document metadata into a single combined CSV file + +API Functions Overview: + Two main functions interact with the Michigan API: + + 1. get_all_agency_info() + - Fetches list of ALL agencies with basic info (name, address, license) + - Called ONCE at start of pipeline + - API method: 'getAgenciesDetail' with recordId=None + + 2. get_content_details_method(record_id) + - Fetches DOCUMENT listings for ONE specific agency + - Called FOR EACH agency to get their PDF metadata + - API method: 'getContentDetails' with specific recordId + - Returns list of PDFs with ContentDocumentId needed for download + + Key Distinction: + - getAgenciesDetail: Returns AGENCY information (who they are) + - getContentDetails: Returns DOCUMENT listings (what files they have) + +Usage: + python pull_agency_info_api.py --run-dir run_2025-11-03 [--verbose] + +Output (in run directory): + - YYYY-MM-DD_all_agency_info.json: Complete API response with all agencies + - YYYY-MM-DD_agency_info.csv: Agency information in CSV format + - YYYY-MM-DD_combined_pdf_content_details.csv: All documents from all agencies + - Individual JSON/CSV files per agency (removed after merging by default) + +Author: STATCOM MCYJ project +""" + +import csv +import requests +import json +import urllib.parse +import urllib3 +import argparse +import os +import logging +from datetime import datetime + +# Set up logger +logger = logging.getLogger(__name__) + +def get_all_agency_info(): + """ + Fetches basic information for ALL agencies from the Michigan API. + + This function calls the 'getAgenciesDetail' API method with recordId=None to + retrieve a list of all child welfare agencies in Michigan, including basic + information like agency name, address, license status, etc. + + Returns: + dict: JSON response containing a list of all agencies with their basic info + None: If the request fails + + Note: This returns agency metadata ONLY, not the documents/PDFs associated + with each agency. Use get_content_details_method() to fetch documents for + a specific agency. + """ + urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning) + + base_url = "https://michildwelfarepubliclicensingsearch.michigan.gov/licagencysrch/webruntime/api/apex/execute" + + params = { + "cacheable": "true", + "classname": "@udd/01p8z0000009E4V", + "isContinuation": "false", + "method": "getAgenciesDetail", + "namespace": "", + "params": json.dumps({"recordId": None}), + "language": "en-US", + "asGuest": "true", + "htmlEncode": "false" + } + + headers = { + 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36', + 'Accept': 'application/json, text/plain, */*', + 'Accept-Language': 'en-US,en;q=0.9', + 'Referer': 'https://michildwelfarepubliclicensingsearch.michigan.gov/licagencysrch/' + } + + try: + logger.info("GET request with recordId=null") + response = requests.get(base_url, params=params, headers=headers, verify=False, timeout=30) + response.raise_for_status() + return response.json() + except Exception as e: + logger.error(f"GET request with recordId=null failed: {e}") + return None + +def get_content_details_method(record_id): + """ + Fetches document/PDF metadata for a SPECIFIC agency by its record ID. + + This is the function used to get the list of documents associated with a + specific agency. 
+ + This function calls the 'getContentDetails' API method with a specific + recordId to get a list of all documents (PDFs) associated with that agency, + including metadata like: + - Title: Document name/title + - CreatedDate: When the document was created + - FileExtension: Usually 'pdf' + - ContentDocumentId: Unique ID needed to download the actual PDF file + - ContentBodyId: Internal reference ID + + Args: + record_id (str): The agency's unique identifier (agencyId) + + Returns: + dict: JSON response containing list of documents in 'contentVersionRes' key + None: If the request fails + + """ + urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning) + + base_url = "https://michildwelfarepubliclicensingsearch.michigan.gov/licagencysrch/webruntime/api/apex/execute" + + # JSON payload + payload = { + "namespace": "", + "classname": "@udd/01p8z0000009E4V", + "method": "getContentDetails", + "isContinuation": False, + "params": { + "recordId": record_id + }, + "cacheable": False + } + + headers = { + 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36', + 'Accept': 'application/json, text/plain, */*', + 'Content-Type': 'application/json', + 'X-Requested-With': 'XMLHttpRequest', + 'Origin': 'https://michildwelfarepubliclicensingsearch.michigan.gov', + 'Referer': 'https://michildwelfarepubliclicensingsearch.michigan.gov/licagencysrch/' + } + + logger.info("POST with JSON payload directly to the API endpoint") + logger.debug(f"Payload: {json.dumps(payload, indent=2)}") + + response = requests.post( + base_url, + json=payload, + headers=headers, + verify=False, + timeout=30 + ) + response.raise_for_status() + return response.json() + + +def merge_agency_info(agency_csv, run_dir = ".", remove_files=False): + """ + Merges the agency details into the all agency info dictionary. 
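+
+    Args (inferred from the function body):
+        agency_csv: per-run agency info CSV containing 'agencyId' and 'AgencyName' columns
+        run_dir: directory holding the per-agency "{agencyId}_pdf_content_details.csv" files
+        remove_files: if True, delete the per-agency CSV/JSON files after merging
+
+    Returns:
+        str: path to the combined "{YYYY-MM-DD}_combined_pdf_content_details.csv"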
+ """ + date_str = datetime.now().strftime("%Y-%m-%d") + + # Build a mapping from agencyId to AgencyName + agency_names = {} + with open(agency_csv, mode='r', encoding='utf-8') as f: + reader = csv.DictReader(f) + for row in reader: + agency_id = row.get('agencyId') + agency_name = row.get('AgencyName') + if agency_id and agency_name: + agency_names[agency_id] = agency_name + + # Merge PDF content details for each agency + + combined_rows = [] + header = [] + for agency_id, agency_name in agency_names.items(): + pdf_csv = os.path.join(run_dir, f"{agency_id}_pdf_content_details.csv") + if os.path.exists(pdf_csv): + with open(pdf_csv, mode='r', encoding='utf-8') as f: + reader = csv.reader(f) + header = next(reader) + header = ['agency_name'] + header + for row in reader: + combined_rows.append([agency_name] + row) + else: + logger.warning(f"PDF content details CSV not found for agency ID {agency_id}, skipping...") + continue + + # Write out the combined CSV + combined_csv = os.path.join(run_dir, f"{date_str}_combined_pdf_content_details.csv") + with open(combined_csv, mode='w', newline='', encoding='utf-8') as f: + writer = csv.writer(f, quoting=csv.QUOTE_ALL) + # Write header: agency_id + original header + writer.writerow(header) + writer.writerows(combined_rows) + + logger.info(f"Combined PDF content details written to {combined_csv}") + # If remove files then remove each file + if remove_files: + for agency_id, agency_name in agency_names.items(): + pdf_csv = os.path.join(run_dir, f"{agency_id}_pdf_content_details.csv") + json_path = os.path.join(run_dir, f"{agency_id}_pdf_content_details.json") + if os.path.exists(pdf_csv): + os.remove(pdf_csv) + logger.debug(f"Removed file: {pdf_csv}") + if os.path.exists(json_path): + os.remove(json_path) + logger.debug(f"Removed file: {json_path}") + return combined_csv + + +def pull_all_agency_metadata(run_dir, overwrite=False, remove_files=True, verbose=False): + """ + Main function to pull all agency metadata and document listings from Michigan API. + + This function orchestrates the complete metadata retrieval process: + 1. Fetches all agency information + 2. Saves agency info to JSON and CSV + 3. For each agency, fetches document metadata + 4. 
Merges all document metadata into a combined CSV + + Args: + run_dir (str): Directory to save all metadata files + overwrite (bool): If False, skip agencies with existing CSV files (default: False) + remove_files (bool): Remove individual agency files after merging (default: True) + verbose (bool): Enable verbose output (default: False) + + Returns: + str: Path to the combined CSV file containing all document metadata + + Raises: + RuntimeError: If unable to fetch agency information from API + """ + os.makedirs(run_dir, exist_ok=True) + + # Step 1: Fetch all agency information + all_agency_info = get_all_agency_info() + if not all_agency_info: + raise RuntimeError("Failed to fetch agency information from API") + + if verbose: + logger.debug(json.dumps(all_agency_info, indent=2)) + + date_str = datetime.now().strftime("%Y-%m-%d") + + # Save complete agency info JSON + agency_file = os.path.join(run_dir, f"{date_str}_all_agency_info.json") + with open(agency_file, "w", encoding="utf-8") as f: + json.dump(all_agency_info, f, indent=2, ensure_ascii=False) + logger.info(f"All agency information saved to {agency_file}") + + # Extract agency list + agency_list = ( + all_agency_info.get('returnValue', {}) + .get('objectData', {}) + .get('responseResult', []) + ) + + # Step 2: Save agency info as CSV + agency_keep_cols = [ + "Address", "agencyId", "AgencyName", "AgencyType", "City", "County", + "LicenseEffectiveDate", "LicenseeGroupOrganizationName", + "LicenseExpirationDate", "LicenseNumber", "LicenseStatus", "Phone", "ZipCode" + ] + + agency_csv_file = os.path.join(run_dir, f"{date_str}_agency_info.csv") + with open(agency_csv_file, mode='w', newline='', encoding='utf-8') as f: + writer = csv.DictWriter(f, fieldnames=agency_keep_cols, quoting=csv.QUOTE_ALL) + writer.writeheader() + for agency in agency_list: + row = {col: agency.get(col, "") for col in agency_keep_cols} + writer.writerow(row) + logger.info(f"Agency info written to {agency_csv_file}") + + # Step 3: Fetch document metadata for each agency + doc_keep_cols = ['FileExtension', 'CreatedDate', 'Title', 'ContentBodyId', 'Id', 'ContentDocumentId'] + + for agency in agency_list: + record_id = agency.get('agencyId') + if not record_id: + if verbose: + logger.debug(f"Skipping agency with empty ID") + continue + + csv_file = os.path.join(run_dir, f"{record_id}_pdf_content_details.csv") + + # Skip if file exists and not overwriting + if not overwrite and os.path.exists(csv_file): + if verbose: + logger.debug(f"Skipping {record_id} (file exists, overwrite=False)") + continue + + if verbose: + logger.info(f"Processing agency ID: {record_id}") + + pdf_results = get_content_details_method(record_id) + + if pdf_results: + # Save full JSON + json_file = os.path.join(run_dir, f"{record_id}_pdf_content_details.json") + with open(json_file, "w", encoding="utf-8") as jf: + json.dump(pdf_results, jf, indent=2, ensure_ascii=False) + + # Save CSV with key fields + with open(csv_file, mode='w', newline='', encoding='utf-8') as f: + writer = csv.writer(f, quoting=csv.QUOTE_ALL) + writer.writerow(['agency_id'] + doc_keep_cols) + for p in pdf_results.get('returnValue', {}).get('contentVersionRes', []): + row_data = [record_id] + [p.get(k, "") for k in doc_keep_cols] + writer.writerow(row_data) + + if verbose: + logger.debug(f" Saved document metadata for {record_id}") + else: + logger.warning(f"Failed to retrieve document details for agency ID: {record_id}") + + # Step 4: Merge all document metadata into single CSV + combined_csv = 
merge_agency_info(agency_csv_file, run_dir, remove_files=remove_files) + return combined_csv + + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + description="Pull agency metadata and document listings from Michigan Child Welfare API" + ) + parser.add_argument( + "--run-dir", + dest="run_dir", + help="Directory for this run's metadata and artifacts", + default="./" + ) + parser.add_argument( + "--overwrite", + dest="overwrite", + action="store_true", + help="Overwrite existing files (default: False)" + ) + parser.add_argument( + "--remove-files", + dest="remove_files", + action="store_true", + default=True, + help="Remove individual agency files after merging (default: True)" + ) + parser.add_argument( + "--verbose", + action="store_true", + help="Enable verbose output" + ) + + args = parser.parse_args() + + # Configure logging + logging.basicConfig( + level=logging.DEBUG if args.verbose else logging.INFO, + format='%(asctime)s - %(name)s - %(levelname)s - %(message)s' + ) + + try: + combined_csv = pull_all_agency_metadata( + run_dir=args.run_dir, + overwrite=args.overwrite, + remove_files=args.remove_files, + verbose=args.verbose + ) + logger.info(f"Success! Combined metadata saved to: {combined_csv}") + except Exception as e: + logger.error(f"Error: {e}") + import traceback + traceback.print_exc() + exit(1) \ No newline at end of file diff --git a/mcyj_download.py b/mcyj_download.py deleted file mode 100644 index e02de94..0000000 --- a/mcyj_download.py +++ /dev/null @@ -1,80 +0,0 @@ -from datetime import datetime -import os -import re -import argparse -import csv - -def file_info_to_filename(agency_id, document_name, document_date): - # Convert file information dictionary to a filename string - - document_agency = agency_id.strip().replace(" ", "_").replace("/", "_") - document_name = document_name.strip().replace(" ", "_").replace("/", "-") - - return f"{document_agency}_{document_name}_{document_date}.pdf" - -def get_output_dir_info(output_dir): - """ - Get the list of files in the output directory and the latest date from filenames. - - Args: - output_dir (str): Directory to check for existing files.""" - - existing_files = os.listdir(output_dir) - if existing_files: - pdf_files = [f for f in existing_files if re.match(r'.*\d{4}-\d{2}-\d{2}\.pdf$', f)] - # Extract date from filename using regex and find the most recent date - all_dates = [] - for f in pdf_files: - match = re.search(r'(\d{4}-\d{2}-\d{2})\.pdf$', f) - if match: - all_dates.append(match.group(1)) - # Get the latest date from the list, parsing as YYYY-MM-DD date - all_dates = [datetime.strptime(date, '%Y-%m-%d') for date in all_dates] - if all_dates: - latest_date = max(all_dates).strftime('%Y-%m-%d') - print(f"Latest date found in existing files: {latest_date}") - else: - latest_date = None - - return existing_files, latest_date - - -if __name__ == "__main__": - parser = argparse.ArgumentParser(description="Download Child Welfare Licensing agency PDFs from Michigan's public licensing search.") - parser.add_argument("--output-dir", dest="output_dir", help="Directory to save the CSV and JSON files", default="./") - parser.add_argument("--input-file", dest="input_file", help="Path to the input CSV file") - args = parser.parse_args() - output_dir = args.output_dir - input_file = args.input_file - - # Read all of the files in the output directory and get the latest date - if not os.path.exists(output_dir): - print(f"Output directory {output_dir} does not exist. 
Creating it.") - os.makedirs(output_dir) - existing_files, latest_date = get_output_dir_info(output_dir) - - # Read in the input file csv as dictionary - if input_file: - if os.path.exists(input_file): - with open(input_file, 'r') as f: - reader = csv.DictReader(f) - input_data = [row for row in reader] - else: - raise FileNotFoundError(f"Input file {input_file} does not exist.") - - for row in input_data: - # Parse document_date in "2023-08-22T15:30:32.000Z" format to "YYYY-MM-DD" - raw_date = row.get('CreatedDate', '') - try: - parsed_date = datetime.strptime(raw_date, "%Y-%m-%dT%H:%M:%S.%fZ").strftime("%Y-%m-%d") - except (ValueError, TypeError): - parsed_date = raw_date # fallback if parsing fails - - print(file_info_to_filename( - agency_id=row.get('agency_id', ''), - document_name=row.get('Title', ''), - document_date=parsed_date - )) -# print(input_data) - print(existing_files[:5]) - print(latest_date) \ No newline at end of file diff --git a/parse_available_files.py b/parse_available_files.py deleted file mode 100644 index 4815f74..0000000 --- a/parse_available_files.py +++ /dev/null @@ -1,121 +0,0 @@ -# Selenium is used for web automation and scraping -from selenium import webdriver -from selenium.webdriver.chrome.service import Service -from selenium.webdriver.common.by import By -from selenium.webdriver.support.ui import WebDriverWait -from selenium.webdriver.support import expected_conditions as EC -from selenium.webdriver.common.keys import Keys -from selenium.common.exceptions import ElementClickInterceptedException -# Time utilities for delays and timestamp handling -import argparse -import time -from time import sleep -# Date and time parsing -from datetime import datetime -# File and path operations -from pathlib import Path -import os -import glob - -import csv -import re - -## First get the agency URLs (and ids) -#def get_agency_information(): -def get_agency_information(driver): - """ - Extracts agency information from a web page using Selenium. - - Args: - driver (webdriver): The Selenium WebDriver instance. - - Returns: - list: A list of dictionaries containing agency information. 
- """ - driver.get("https://michildwelfarepubliclicensingsearch.michigan.gov/licagencysrch/") - time.sleep(5) # Wait for the page to fully load (longer than usual due to dynamic content) - # Prepare an empty list to store all agency-specific URLs found across pages - - sub_urls = [] - table_header = [] - table_data = [] - header_elements = driver.find_elements(By.XPATH, "//lightning-datatable//table/thead/tr/th") - # Clean up header text by removing "Sort by:" and "Sorted: None" - for i, header in enumerate(header_elements): - text = header.text.strip() - text = text.replace("Sort by:", "").replace("Sorted: None", "").replace("\n", " ").strip() - table_header.append(text) - - while True: - # Find the license number rows: - # Change this to tr OR th - table_rows = driver.find_elements(By.XPATH, "//lightning-datatable//table/tbody/tr") - for row in table_rows: - # Find all columns (td elements) in the row - row_data = [] - columns = row.find_elements(By.XPATH, "./td | ./th") - for col in columns: - row_data.append(col.text.strip()) - table_data.append(row_data) - - for row in table_rows: - link_elements = row.find_elements(By.XPATH, ".//lightning-formatted-url/a") - for link in link_elements: - href = link.get_attribute('href') - if href: - sub_urls.append(href) - - try: - # Try to locate and click the "Next" page button to load the next page of results - next_button = driver.find_element(By.XPATH, "//lightning-button-icon[3]/button/lightning-primitive-icon") - next_button.click() - except ElementClickInterceptedException: - # If the click fails (e.g., no more pages or overlay blocking it), stop the loop - print("No more pages available.") - break - return sub_urls, table_header, table_data - -def write_agency_information_to_csv(sub_urls, table_header, table_data, output_dir): - """ - Writes the agency information to a CSV file. - - Args: - sub_urls (list): List of agency URLs. - table_header (list): List of table headers. - table_data (list): List of table data rows. - output_dir (str): Directory where the CSV file will be saved. 
- """ - # Ensure output directory exists - os.makedirs(output_dir, exist_ok=True) - - # Define the output CSV file path - # Add date to the filename to avoid overwriting - date_str = datetime.now().strftime("%Y%m%d") - csv_filename = f"agency_information_{date_str}.csv" - csv_file_path = os.path.join(output_dir, csv_filename) - - # Write to CSV file - with open(csv_file_path, 'w', newline='', encoding='utf-8') as csvfile: - writer = csv.writer(csvfile, quoting=csv.QUOTE_ALL) - writer.writerow(table_header) # Write header - writer.writerows(table_data) # Write data rows - - print(f"Agency information written to {csv_file_path}") - - -if __name__ == "__main__": - parser = argparse.ArgumentParser(description="Download Child Welfare Licensing agency PDFs from Michigan's public licensing search.") - parser.add_argument("--driver-path", dest="driver_path", help="Path to the ChromeDriver executable", default=None) - parser.add_argument("--output-dir", dest="output_dir", help="Directory to save the CSV file", default="./") - - args = parser.parse_args() - - # Initialize the WebDriver - service = Service(args.driver_path) - driver = webdriver.Chrome(service=service) - - try: - sub_urls, table_header, table_data = get_agency_information(driver) - write_agency_information_to_csv(sub_urls, table_header, table_data, args.output_dir) - finally: - driver.quit() # Ensure the driver is closed after use \ No newline at end of file diff --git a/pull_agency_info_api.py b/pull_agency_info_api.py deleted file mode 100644 index 5764ff9..0000000 --- a/pull_agency_info_api.py +++ /dev/null @@ -1,282 +0,0 @@ -import csv -import requests -import json -import urllib.parse -import urllib3 -import argparse -import os -from datetime import datetime - -def get_all_agency_info(): - urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning) - - base_url = "https://michildwelfarepubliclicensingsearch.michigan.gov/licagencysrch/webruntime/api/apex/execute" - - params = { - "cacheable": "true", - "classname": "@udd/01p8z0000009E4V", - "isContinuation": "false", - "method": "getAgenciesDetail", - "namespace": "", - "params": json.dumps({"recordId": None}), - "language": "en-US", - "asGuest": "true", - "htmlEncode": "false" - } - - headers = { - 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36', - 'Accept': 'application/json, text/plain, */*', - 'Accept-Language': 'en-US,en;q=0.9', - 'Referer': 'https://michildwelfarepubliclicensingsearch.michigan.gov/licagencysrch/' - } - - try: - print("GET request with recordId=null") - response = requests.get(base_url, params=params, headers=headers, verify=False, timeout=30) - response.raise_for_status() - return response.json() - except Exception as e: - print(f"GET request with recordId=null failed: {e}") - return None - -def get_agency_details(record_id): - """ - GET request with URL parameters directly to the API endpoint - """ - urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning) - - base_url = "https://michildwelfarepubliclicensingsearch.michigan.gov/licagencysrch/webruntime/api/apex/execute" - - # Build the exact URL from your example - params = { - "cacheable": "true", - "classname": "@udd/01p8z0000009E4V", - "isContinuation": "false", - "method": "getAgenciesDetail", - "namespace": "", - "params": json.dumps({"recordId": record_id}), - "language": "en-US", - "asGuest": "true", - "htmlEncode": "false" - } - - headers = { - 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36', - 'Accept': 'application/json, 
text/plain, */*', - 'Accept-Language': 'en-US,en;q=0.9', - 'Referer': 'https://michildwelfarepubliclicensingsearch.michigan.gov/licagencysrch/' - } - - try: - print("GET request with URL parameters") - response = requests.get(base_url, params=params, headers=headers, verify=False, timeout=30) - response.raise_for_status() - return response.json() - except Exception as e: - print(f"GET request with URL parameters failed: {e}") - return None - -# Fetch the list of documents (PDF metadata) for an agency -def get_content_details_method(record_id): - """ - POST with JSON payload directly to the API endpoint - """ - urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning) - - base_url = "https://michildwelfarepubliclicensingsearch.michigan.gov/licagencysrch/webruntime/api/apex/execute" - - # JSON payload - payload = { - "namespace": "", - "classname": "@udd/01p8z0000009E4V", - "method": "getContentDetails", - "isContinuation": False, - "params": { - "recordId": record_id - }, - "cacheable": False - } - - headers = { - 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36', - 'Accept': 'application/json, text/plain, */*', - 'Content-Type': 'application/json', - 'X-Requested-With': 'XMLHttpRequest', - 'Origin': 'https://michildwelfarepubliclicensingsearch.michigan.gov', - 'Referer': 'https://michildwelfarepubliclicensingsearch.michigan.gov/licagencysrch/' - } - - try: - print("POST with JSON payload directly to the API endpoint") - print(f"Payload: {json.dumps(payload, indent=2)}") - - response = requests.post( - base_url, - json=payload, - headers=headers, - verify=False, - timeout=30 - ) - response.raise_for_status() - return response.json() - except Exception as e: - print(f"POST with JSON payload directly to the API endpoint failed: {e}") - if 'response' in locals(): - print(f"Response content: {response.text}") - return None - -def merge_agency_info(agency_csv, output_dir=".", remove_files=False): - """ - Merge the per-agency PDF content details CSVs into a single combined CSV, prefixing each row with the agency name.
- """ - date_str = datetime.now().strftime("%Y-%m-%d") - - # Build a mapping from agencyId to AgencyName - agency_names = {} - with open(agency_csv, mode='r', encoding='utf-8') as f: - reader = csv.DictReader(f) - for row in reader: - agency_id = row.get('agencyId') - agency_name = row.get('AgencyName') - if agency_id and agency_name: - agency_names[agency_id] = agency_name - - # Merge PDF content details for each agency - - combined_rows = [] - header = [] - for agency_id, agency_name in agency_names.items(): - pdf_csv = os.path.join(output_dir, f"{agency_id}_pdf_content_details.csv") - if os.path.exists(pdf_csv): - with open(pdf_csv, mode='r', encoding='utf-8') as f: - reader = csv.reader(f) - header = next(reader) - header = ['agency_name'] + header - for row in reader: - combined_rows.append([agency_name] + row) - else: - print(f"Warning: PDF content details CSV not found for agency ID {agency_id}, skipping...") - continue - - # Write out the combined CSV - combined_csv = os.path.join(output_dir, f"{date_str}_combined_pdf_content_details.csv") - with open(combined_csv, mode='w', newline='', encoding='utf-8') as f: - writer = csv.writer(f, quoting=csv.QUOTE_ALL) - # Write header: agency_id + original header - writer.writerow(header) - writer.writerows(combined_rows) - - print(f"Combined PDF content details written to {combined_csv}") - # If remove files then remove each file - if remove_files: - for agency_id, agency_name in agency_names.items(): - pdf_csv = os.path.join(output_dir, f"{agency_id}_pdf_content_details.csv") - json_path = os.path.join(output_dir, f"{agency_id}_pdf_content_details.json") - if os.path.exists(pdf_csv): - os.remove(pdf_csv) - print(f"Removed file: {pdf_csv}") - if os.path.exists(json_path): - os.remove(json_path) - print(f"Removed file: {json_path}") - return combined_csv - -# Test the functions -if __name__ == "__main__": - parser = argparse.ArgumentParser(description="Download Child Welfare Licensing agency PDFs from Michigan's public licensing search.") - parser.add_argument("--output-dir", dest="output_dir", help="Directory to save the CSV and JSON files", default="./") - parser.add_argument("--overwrite", dest="overwrite", help="Overwrite existing files", default=True) - parser.add_argument("--remove-files", dest="remove_files", help="Remove individual agency files after merging", default=True) - parser.add_argument("--verbose", dest="verbose", help="Enable verbose output", default=False, action='store_true') - args = parser.parse_args() - output_dir = args.output_dir - - # # Patch all print statements in functions - # builtins.print = lambda *a, **kw: log_print(' '.join(str(x) for x in a), logging.INFO) - - os.makedirs(output_dir, exist_ok=True) - - all_agency_info = get_all_agency_info() - print(json.dumps(all_agency_info, indent=2)) - date_str = datetime.now().strftime("%Y-%m-%d") - agency_file = os.path.join(output_dir, f"{date_str}_all_agency_info.json") - - with open(agency_file, "w", encoding="utf-8") as f: - json.dump(all_agency_info, f, indent=2, ensure_ascii=False) - - print("All agency information saved to all_agency_info.json") - - # Extract the list from all_agency_info - agency_list = ( - all_agency_info.get('returnValue', {}) - .get('objectData', {}) - .get('responseResult', []) - ) - - # Define the columns to keep - keep_cols = [ - "Address", - "agencyId", - "AgencyName", - "AgencyType", - "City", - "County", - "LicenseEffectiveDate", - "LicenseeGroupOrganizationName", - "LicenseExpirationDate", - "LicenseNumber", - "LicenseStatus", - 
"Phone", - "ZipCode" - ] - - # Update CSV filename to include date - agency_csv_file = os.path.join(output_dir, f"{date_str}_agency_info.csv") - with open(agency_csv_file, mode='w', newline='', encoding='utf-8') as f: - writer = csv.DictWriter(f, fieldnames=keep_cols, quoting=csv.QUOTE_ALL) - writer.writeheader() - for agency in agency_list: - row = {col: agency.get(col, "") for col in keep_cols} - writer.writerow(row) - print(f"Agency info written to {agency_csv_file}") - - keep_cols = ['FileExtension', 'CreatedDate', 'Title', 'ContentBodyId', 'Id', 'ContentDocumentId'] - - # Run for each agency id - for agency in agency_list: - record_id = agency.get('agencyId') - csv_file = os.path.join(output_dir, f"{record_id}_pdf_content_details.csv") - if not record_id: - print(f"Skipping agency ID {record_id} as it is empty.") - continue - if args.overwrite and os.path.exists(csv_file): - print(f"File {csv_file} already exists and overwrite is enabled, skipping agency ID {record_id}.") - continue - - print(f"Processing agency ID: {record_id}") - pdf_results = get_content_details_method(record_id) - - if pdf_results: - print(f"PDF Content Details for {record_id}:") - # print(json.dumps(pdf_results, indent=2)) - # Save full JSON response to file - json_file = os.path.join(output_dir, f"{record_id}_pdf_content_details.json") - with open(json_file, "w", encoding="utf-8") as jf: - json.dump(pdf_results, jf, indent=2, ensure_ascii=False) - print(f"Full JSON results written to {json_file}") - - # Write top-level keys/values to CSV - csv_file = os.path.join(output_dir, f"{record_id}_pdf_content_details.csv") - with open(csv_file, mode='w', newline='', encoding='utf-8') as f: - writer = csv.writer(f, quoting=csv.QUOTE_ALL) - # Write the header - writer.writerow(['agency_id'] + keep_cols) - for p in pdf_results.get('returnValue', {}).get('contentVersionRes', []): - row_data = [record_id] + [p.get(k, "") for k in keep_cols] - writer.writerow(row_data) - - print(f"Top-level JSON results written to {csv_file}") - else: - print(f"Failed to retrieve PDF content details for agency ID: {record_id}") - - merge_agency_info(agency_csv_file, output_dir, remove_files=args.remove_files) \ No newline at end of file