#!/usr/bin/env python3
"""Combine ``<base_name>.<catalog>.src.txt.anon.map.txt`` files into one CSV.

Each input line is expected to hold at least five whitespace-separated
fields: index, anonymous graph, SMILES, vendor id, and a similarity code
of the form ``[a,b,c,d,e,f|g,h,i,j]=0.85``.  The similarity code is
decomposed into ten component columns (sc1..sc9, sc0) plus the final
similarity value.
"""

import glob
import re
import sys
from pathlib import Path

import pandas as pd

# Matches ints and floats, including scientific notation (e.g. 8.5e-1).
# The original pattern lacked the exponent group, which made the
# `'e' in x.lower()` float check below unreachable and split a token like
# "8.5e-1" into two spurious numbers.  Compiled once since it runs per line.
_NUMBER_RE = re.compile(r'[-+]?\d*\.?\d+(?:[eE][-+]?\d+)?')


def parse_similarity_code(similarity_code: str):
    """Parse ``[a,b,c,d,e,f|g,h,i,j]=0.85`` into sc1..sc6, sc7..sc9, sc0 and the final value.

    Returns a list of 11 elements: the first 10 are the component codes
    (padded with ``None`` when fewer are present) and the last is the
    similarity value, or ``None`` when absent.  Empty/NaN input yields
    all ``None``.
    """
    if not similarity_code or pd.isna(similarity_code):
        return [None] * 10 + [None]

    # Extract every number in the string; brackets/pipes/commas are ignored.
    numbers = _NUMBER_RE.findall(similarity_code)

    # Keep integers as int; anything with a decimal point or exponent
    # becomes float (the similarity value is typically a float).
    values = [float(x) if '.' in x or 'e' in x.lower() else int(x)
              for x in numbers]

    # Pad with None if fewer than 10 component codes were found.
    while len(values) < 10:
        values.append(None)

    sc_values = values[:10]  # sc1..sc9, sc0
    sim_value = values[10] if len(values) > 10 else None
    return sc_values + [sim_value]


def main():
    """CLI entry point: glob the map files, parse them, write a combined CSV."""
    if len(sys.argv) < 2:
        # NOTE: original usage text omitted the mandatory <base_name> argument.
        print("Usage: python parse_anon_maps.py <base_name> [output_csv]")
        print("Example: python parse_anon_maps.py ZINC12345")
        sys.exit(1)

    base_name = sys.argv[1]
    output_file = sys.argv[2] if len(sys.argv) > 2 else f"{base_name}_combined_maps.csv"

    # Find all matching files, e.g. ZINC12345.<catalog>.src.txt.anon.map.txt
    pattern = f"{base_name}.*.src.txt.anon.map.txt"
    files = sorted(glob.glob(pattern))
    if not files:
        print(f"No files found matching: {pattern}")
        return

    print(f"Found {len(files)} matching files for '{base_name}'")

    data = []
    for filepath in files:
        path = Path(filepath)
        # Catalog name is the second dotted component of the filename.
        parts = path.name.split('.')
        catalog = parts[1] if len(parts) > 1 and parts[0] == base_name else "unknown"

        with open(filepath, "r", encoding="utf-8") as f:
            for line_num, line in enumerate(f, 1):
                line = line.strip()
                # Skip blank lines and common comment markers.
                if not line or line.startswith(("#", "//", "@")):
                    continue

                fields = line.split()
                if len(fields) < 5:
                    print(f"Warning: Skipping malformed line {line_num} in {path.name}")
                    continue

                index_num = fields[0]
                anon_graph = fields[1]
                smiles = fields[2]
                vendor_id = fields[3]
                # The similarity code may itself contain spaces; rejoin the tail.
                similarity_code = " ".join(fields[4:])

                parsed = parse_similarity_code(similarity_code)
                sc1, sc2, sc3, sc4, sc5, sc6, sc7, sc8, sc9, sc0, similarity_value = parsed

                data.append({
                    "base_name": base_name,
                    "catalog": catalog,
                    "index": index_num,
                    "anonymous_graph": anon_graph,
                    "smiles": smiles,
                    "vendor_id": vendor_id,
                    "similarity_code": similarity_code,  # keep original for reference
                    "sc1": sc1, "sc2": sc2, "sc3": sc3, "sc4": sc4, "sc5": sc5,
                    "sc6": sc6, "sc7": sc7, "sc8": sc8, "sc9": sc9, "sc0": sc0,
                    "similarity_value": similarity_value,
                    "source_file": path.name,
                })

    if not data:
        print("No valid data parsed.")
        return

    df = pd.DataFrame(data)

    # Coerce parsed columns to proper numeric dtypes (None -> NaN).
    sc_cols = ['sc1', 'sc2', 'sc3', 'sc4', 'sc5', 'sc6', 'sc7', 'sc8', 'sc9',
               'sc0', 'similarity_value']
    for col in sc_cols:
        df[col] = pd.to_numeric(df[col], errors='coerce')

    df.to_csv(output_file, index=False)

    print(f"\nSuccess! Parsed {len(df):,} entries from {len(files)} catalogs.")
    print(f"  Output saved to: {output_file}")
    print(f"\nColumns include: sc1 to sc9, sc0, and similarity_value")
    print("\nSample data:")
    print(df.head(8)[['catalog', 'vendor_id', 'sc1', 'sc2', 'sc3', 'sc4',
                      'sc5', 'sc6', 'sc7', 'sc8', 'sc9', 'sc0',
                      'similarity_value']].to_string(index=False))

    # Quick stats
    print(f"\nUnique catalogs: {df['catalog'].nunique()}")
    print(f"Unique SMILES: {df['smiles'].nunique()}")


if __name__ == "__main__":
    main()