#!/usr/bin/env python3
# Combine all per-catalog "anon map" files for one molecule into a single CSV.

import sys
import glob
from pathlib import Path
import pandas as pd

# Fixed tail shared by every input file: {name}.{catalog}.src.txt.anon.map.txt
_MAP_SUFFIX = ".src.txt.anon.map.txt"


def _catalog_from_filename(filename, name_prefix):
    """Return the ``{catalog}`` segment of a map filename.

    The expected layout is ``{name_prefix}.{catalog}.src.txt.anon.map.txt``.
    The known prefix and suffix are stripped instead of splitting on ".",
    so names or catalogs that themselves contain dots are handled correctly.

    Args:
        filename: File name without any directory part.
        name_prefix: The molecule name the filename is expected to start with.

    Returns:
        The catalog string, or ``"unknown"`` when the filename does not
        follow the expected layout or the catalog segment is empty.
    """
    prefix = f"{name_prefix}."
    if filename.startswith(prefix) and filename.endswith(_MAP_SUFFIX):
        catalog = filename[len(prefix):len(filename) - len(_MAP_SUFFIX)]
        if catalog:
            return catalog
    return "unknown"


def _parse_map_file(path, base_name, catalog):
    """Yield one record dict for every well-formed data line in *path*.

    Blank lines and lines starting with "#", "//" or "@" are skipped.
    A data line needs at least five whitespace-separated fields; shorter
    lines are reported with a warning on stdout and skipped.

    Args:
        path: ``pathlib.Path`` of the map file to read (decoded as UTF-8).
        base_name: Molecule name stored in each record's "base_name" column.
        catalog: Catalog name stored in each record's "catalog" column.

    Yields:
        Dicts with the keys base_name, catalog, index, anonymous_graph,
        smiles, vendor_id, similarity_code and source_file (all strings).
    """
    with open(path, "r", encoding="utf-8") as f:
        for line_num, line in enumerate(f, 1):
            line = line.strip()
            if not line or line.startswith(("#", "//", "@")):
                continue  # skip empty lines, comments, or possible headers

            fields = line.split()
            if len(fields) < 5:
                print(f"Warning: Skipping malformed line {line_num} in {path.name} "
                      f"(only {len(fields)} fields): {line[:120]}...")
                continue

            # Everything after the first four fields belongs to the
            # similarity code, which may itself contain spaces.
            index_num, anon_graph, smiles, vendor_id, *similarity = fields
            yield {
                "base_name": base_name,
                "catalog": catalog,
                "index": index_num,
                "anonymous_graph": anon_graph,
                "smiles": smiles,
                "vendor_id": vendor_id,
                "similarity_code": " ".join(similarity),
                "source_file": path.name,
            }


def main():
    """Merge every ``<base_name>.<catalog>.src.txt.anon.map.txt`` file into one CSV.

    Command line: ``<base_name> [output_csv]``. Without a base name the usage
    text is printed and the process exits with status 1. The output file
    defaults to ``<base_name>_combined_maps.csv``.

    Matching files are looked up with ``glob`` relative to the current
    directory, parsed line by line, and written with pandas. When no file
    matches, or no line could be parsed, a message is printed and the
    function returns without writing anything.
    """
    if len(sys.argv) < 2:
        print("Usage: python parse_anon_maps.py <base_name> [output_csv]")
        print("Example: python parse_anon_maps.py ZINC12345")
        print("   (optional second argument = output filename)")
        sys.exit(1)

    base_name = sys.argv[1]
    output_file = sys.argv[2] if len(sys.argv) > 2 else f"{base_name}_combined_maps.csv"

    # Human-readable pattern, used for messages only.
    pattern = f"{base_name}.*.src.txt.anon.map.txt"
    # Escape the user-supplied name so that characters such as "[", "*" or
    # "?" are matched literally; only the catalog segment is a wildcard.
    files = sorted(glob.glob(f"{glob.escape(base_name)}.*{_MAP_SUFFIX}"))

    if not files:
        print(f"No files found matching: {pattern}")
        print("   Make sure you are running the script in the directory containing the files.")
        return

    print(f"Found {len(files)} matching files for '{base_name}'")

    # If base_name was given with a directory part, only its last component
    # appears in the file names.
    name_prefix = Path(base_name).name

    data = []
    for filepath in files:
        path = Path(filepath)
        catalog = _catalog_from_filename(path.name, name_prefix)
        data.extend(_parse_map_file(path, base_name, catalog))

    if not data:
        print("No valid data was parsed from any file.")
        return

    # Combine everything into a clean DataFrame
    df = pd.DataFrame(data)

    # Save to CSV (easy to open in Excel, pandas, etc.)
    df.to_csv(output_file, index=False)

    print(f"Success! Parsed {len(df):,} total entries from {len(files)} catalogs.")
    print(f"   Data saved to: {output_file}")
    print(f"   Columns: {list(df.columns)}")
    print("\nSample of the combined data:")
    print(df.head(10).to_string(index=False))

    # Quick stats
    print(f"\nUnique catalogs: {df['catalog'].nunique()}")
    print(f"Unique SMILES:   {df['smiles'].nunique()}")
    print(f"Unique vendor_ids: {df['vendor_id'].nunique()}")


# Run the command-line entry point only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
    main()