#!/usr/bin/env python3
"""Combine per-catalog map files for one base name into a single CSV.

Files named ``<base_name>.<catalog>.txt`` are read line by line; each data
line is split on whitespace into an index, an anonymous graph, a SMILES
string, a vendor id and a similarity code, and the similarity code is
further broken into the ``sc1`` .. ``sc9``, ``sc0`` and ``similarity_value``
columns.
"""
import glob
import re
import sys
from pathlib import Path

import pandas as pd

# Signed integer/decimal with an optional exponent, so that a value such as
# "1e-05" is kept as one token instead of being split into "1" and "-05".
_NUMBER_RE = re.compile(r'[-+]?\d*\.?\d+(?:[eE][-+]?\d+)?')

# Output column names for the integer components, in positional order.
_SC_COLUMNS = ['sc1', 'sc2', 'sc3', 'sc4', 'sc5', 'sc6', 'sc7', 'sc8', 'sc9', 'sc0']

# Lines starting with any of these prefixes are skipped without a warning.
_COMMENT_PREFIXES = ("#", "//", "@")

# index, anonymous graph, SMILES, vendor id, plus at least one code token.
_MIN_FIELDS = 5


def _to_int_or_none(token):
    """Return ``int(token)``, or None when the token is not a plain integer."""
    try:
        return int(token)
    except ValueError:
        return None


def _to_float_or_none(token):
    """Return ``float(token)``, or None when the token cannot be converted."""
    try:
        return float(token)
    except ValueError:
        return None


def parse_similarity_code(similarity_code: str) -> list:
    """Split a similarity code string into its numeric components.

    Every number found in the string is extracted in order of appearance;
    punctuation such as ``[``, ``,``, ``|``, ``]`` and ``=`` only acts as a
    separator. The first ten numbers become ``sc1`` .. ``sc9``, ``sc0`` and
    must be plain integers (a non-integer token yields None in that slot).
    The eleventh number, when present, is the similarity value and may be a
    float, including scientific notation.

    NOTE(review): the previous docstring gave ``[n,n,n,n,n,n|n,n,n]=n`` as
    the format, which has only nine bracketed values. With that shape the
    value after ``=`` lands in the ``sc0`` slot and the similarity value is
    None. Positional behaviour is kept as it was -- confirm the real format.

    Args:
        similarity_code: The raw code text; may be empty, None or NaN.

    Returns:
        A list of eleven items: ten ``int`` or None values followed by one
        ``float`` or None. All eleven are None when the input is empty or
        missing, or contains fewer than ten numbers.
    """
    slot_count = len(_SC_COLUMNS)
    if not similarity_code or pd.isna(similarity_code):
        return [None] * (slot_count + 1)

    tokens = _NUMBER_RE.findall(similarity_code)
    if len(tokens) < slot_count:
        return [None] * (slot_count + 1)

    sc_values = [_to_int_or_none(tok) for tok in tokens[:slot_count]]
    sim_value = (
        _to_float_or_none(tokens[slot_count]) if len(tokens) > slot_count else None
    )
    return sc_values + [sim_value]


def _catalog_from_filename(filename, base_name):
    """Return the catalog segment of ``<base>.<catalog>.txt``, or "unknown".

    Only the final path component of ``base_name`` is compared, and the
    comparison is a prefix match, so a base name that contains a directory
    or a dot still resolves to the right catalog.
    """
    prefix = Path(base_name).name + "."
    if not filename.startswith(prefix):
        return "unknown"
    return filename[len(prefix):].split(".", 1)[0]


def _read_records(path, base_name):
    """Yield one row dict per well-formed data line of ``path``.

    Blank lines and comment lines are skipped silently; lines with fewer
    than five whitespace-separated fields are reported and skipped.
    """
    catalog = _catalog_from_filename(path.name, base_name)
    with open(path, "r", encoding="utf-8") as handle:
        for line_num, raw_line in enumerate(handle, 1):
            line = raw_line.strip()
            if not line or line.startswith(_COMMENT_PREFIXES):
                continue

            fields = line.split()
            if len(fields) < _MIN_FIELDS:
                print(f"Warning: Skipping malformed line {line_num} in {path.name}")
                continue

            index_num, anon_graph, smiles, vendor_id = fields[:4]
            # Everything after the fourth field belongs to the code, which
            # may itself have contained whitespace.
            similarity_code = " ".join(fields[4:])
            parsed = parse_similarity_code(similarity_code)

            record = {
                "base_name": base_name,
                "catalog": catalog,
                "index": index_num,
                "anonymous_graph": anon_graph,
                "smiles": smiles,
                "vendor_id": vendor_id,
                "similarity_code": similarity_code,
            }
            record.update(zip(_SC_COLUMNS, parsed))
            record["similarity_value"] = parsed[-1]
            record["source_file"] = path.name
            yield record


def main():
    """Command-line entry point.

    Reads ``sys.argv``: the first argument is the base name, the optional
    second argument is the output CSV path (default
    ``<base_name>_combined_maps.csv``). Exits with status 1 when the base
    name is missing; returns without writing anything when no files match
    or no line could be parsed.
    """
    if len(sys.argv) < 2:
        print("Usage: python script2a.py <base_name> [output_csv]")
        print("Example: python script2a.py ZINC12345")
        sys.exit(1)

    base_name = sys.argv[1]
    output_file = sys.argv[2] if len(sys.argv) > 2 else f"{base_name}_combined_maps.csv"

    # Escape the base name so glob metacharacters in it are matched
    # literally; only the catalog segment is a wildcard.
    files = sorted(glob.glob(f"{glob.escape(base_name)}.*.txt"))

    if not files:
        print(f"No files found matching: {base_name}.*.txt")
        return

    print(f"Found {len(files)} matching files for '{base_name}'")

    data = []
    for filepath in files:
        data.extend(_read_records(Path(filepath), base_name))

    if not data:
        print("No valid data parsed.")
        return

    df = pd.DataFrame(data)

    # Ensure proper dtypes
    for col in _SC_COLUMNS:
        # Nullable integer dtype keeps missing slots as <NA> rather than
        # turning the whole column into floats.
        df[col] = pd.to_numeric(df[col], errors='coerce').astype('Int64')
    df['similarity_value'] = pd.to_numeric(df['similarity_value'], errors='coerce')

    df.to_csv(output_file, index=False)

    print(f"\nSuccess! Parsed {len(df):,} entries from {len(files)} catalogs.")
    print(f" Output saved to: {output_file}")
    print("\nSample data:")
    sample_columns = ['catalog', 'vendor_id', *_SC_COLUMNS, 'similarity_value']
    print(df.head(8)[sample_columns].to_string(index=False))
    print(f"\nUnique catalogs: {df['catalog'].nunique()}")
    print(f"Unique SMILES: {df['smiles'].nunique()}")


if __name__ == "__main__":
    main()