#!/usr/bin/env python3
"""Combine the anonymous-graph map files of one molecule into a single CSV.

For a given base name the script collects every file named
``<name>.<catalog>.src.txt.anon.map.txt``, parses the whitespace-separated
records inside them and writes all records to one CSV file.
"""

import sys
import glob
from pathlib import Path

import pandas as pd

# Fixed tail shared by every map file; the catalog name sits between the
# molecule name and this suffix.
_MAP_SUFFIX = ".src.txt.anon.map.txt"

# A record needs index, anonymous graph, SMILES, vendor id and at least one
# token of similarity code.
_MIN_FIELDS = 5

# Lines starting with one of these are treated as comments/headers and skipped.
_SKIP_PREFIXES = ("#", "//", "@")


def _catalog_from_filename(filename, base_name):
    """Return the ``<catalog>`` part of ``<name>.<catalog>.src.txt.anon.map.txt``.

    Args:
        filename: Bare file name (no directory part) of a map file.
        base_name: The base name given on the command line. It may include a
            directory prefix; only its final path component is compared.

    Returns:
        The text between ``<name>.`` and the fixed suffix, or ``"unknown"``
        when the file name does not follow the expected layout.
    """
    # Compare against the last path component only, so "dir/NAME" still
    # matches files whose bare name starts with "NAME.".
    prefix = Path(base_name).name + "."
    well_formed = (
        filename.startswith(prefix)
        and filename.endswith(_MAP_SUFFIX)
        and len(filename) > len(prefix) + len(_MAP_SUFFIX)
    )
    if not well_formed:
        return "unknown"
    # Slice instead of split('.') so names or catalogs containing dots survive.
    return filename[len(prefix):-len(_MAP_SUFFIX)]


def _parse_map_file(path, base_name, catalog):
    """Parse one map file into a list of row dictionaries.

    Blank lines and lines starting with ``#``, ``//`` or ``@`` are skipped.
    Lines with fewer than five whitespace-separated fields are reported on
    stdout and skipped.

    Args:
        path: ``Path`` of the file to read (decoded as UTF-8).
        base_name: Value stored in the ``base_name`` column of every row.
        catalog: Value stored in the ``catalog`` column of every row.

    Returns:
        A list of dicts, one per valid record, in file order.
    """
    rows = []
    with open(path, "r", encoding="utf-8") as handle:
        for line_num, raw in enumerate(handle, 1):
            line = raw.strip()
            if not line or line.startswith(_SKIP_PREFIXES):
                continue

            fields = line.split()
            if len(fields) < _MIN_FIELDS:
                print(f"Warning: Skipping malformed line {line_num} in {path.name} "
                      f"(only {len(fields)} fields): {line[:120]}...")
                continue

            index_num, anon_graph, smiles, vendor_id = fields[:4]
            rows.append({
                "base_name": base_name,
                "catalog": catalog,
                "index": index_num,
                "anonymous_graph": anon_graph,
                "smiles": smiles,
                "vendor_id": vendor_id,
                # Everything after the fourth field belongs to the similarity
                # code; runs of whitespace inside it collapse to one space.
                "similarity_code": " ".join(fields[4:]),
                "source_file": path.name,
            })
    return rows


def main():
    """Command-line entry point.

    Reads ``sys.argv``: the first argument is the molecule base name
    (required), the second is the output CSV path (optional, defaults to
    ``<name>_combined_maps.csv``). Exits with status 1 when the base name is
    missing; returns without writing anything when no file matches or no
    record could be parsed.
    """
    if len(sys.argv) < 2:
        print("Usage: python parse_anon_maps.py <name> [output_csv]")
        print("Example: python parse_anon_maps.py ZINC12345")
        print(" (optional second argument = output filename)")
        sys.exit(1)

    base_name = sys.argv[1]
    output_file = sys.argv[2] if len(sys.argv) > 2 else f"{base_name}_combined_maps.csv"

    # Shown to the user in messages.
    pattern = f"{base_name}.*{_MAP_SUFFIX}"
    # Actually used for matching: escape glob metacharacters in the
    # user-supplied name so that only the catalog position is a wildcard.
    files = sorted(glob.glob(f"{glob.escape(base_name)}.*{_MAP_SUFFIX}"))

    if not files:
        print(f"No files found matching: {pattern}")
        print(" Make sure you are running the script in the directory containing the files.")
        return

    print(f"Found {len(files)} matching files for '{base_name}'")

    data = []
    for filepath in files:
        path = Path(filepath)
        catalog = _catalog_from_filename(path.name, base_name)
        data.extend(_parse_map_file(path, base_name, catalog))

    if not data:
        print("No valid data was parsed from any file.")
        return

    # Combine everything into one DataFrame and save it as CSV.
    df = pd.DataFrame(data)
    df.to_csv(output_file, index=False)

    print(f"Success! Parsed {len(df):,} total entries from {len(files)} catalogs.")
    print(f" Data saved to: {output_file}")
    print(f" Columns: {list(df.columns)}")
    print("\nSample of the combined data:")
    print(df.head(10).to_string(index=False))

    # Quick stats
    print(f"\nUnique catalogs: {df['catalog'].nunique()}")
    print(f"Unique SMILES: {df['smiles'].nunique()}")
    print(f"Unique vendor_ids: {df['vendor_id'].nunique()}")


if __name__ == "__main__":
    main()