"""Command line program to reformat enumeration files into tranches.

Nicholas Hadler (nhadler@berkeley.edu) 2023
"""
import os

import pandas as pd  # type: ignore
import typer
from tqdm import tqdm  # type: ignore

app = typer.Typer()


@app.command(help="Reformats enumeration smi files to tranched text files.")
def make_tranches(input_dir: str, output_dir: str) -> None:
    """Reformats enumeration smi files to tranched text files.

    More specifically, this splits molecules in a library based on their
    heavy atom count and their log P so they can be registered with the Zinc
    database.

    Every regular file in ``input_dir`` is read as a headerless,
    whitespace-delimited table. The third column (index 2) is used as the
    tranche name. For each distinct tranche, columns 1 and 0 (in that order)
    of the matching rows are appended, space-separated, to
    ``<output_dir>/<first 3 characters of tranche>/<tranche>.txt``.

    Because output files are opened in append mode, running the command twice
    against the same ``output_dir`` duplicates the rows.

    Example:
        > python make_tranches.py INPUT_DIR OUTPUT_DIR

        Molecule has a tranche of H21P310. It'll be appended to a text file
        "H21P310.txt" inside a "H21" folder.

    Args:
        input_dir (str): Directory containing library to tranche.
        output_dir (str): Directory to output tranched files.
    """
    # Sorted so that the append order across input files is reproducible;
    # os.listdir() returns entries in arbitrary order.
    for file_name in tqdm(sorted(os.listdir(input_dir))):
        # os.path.join works whether or not input_dir ends with a separator.
        smi_path = os.path.join(input_dir, file_name)
        if not os.path.isfile(smi_path):
            # Sub-directories (e.g. an output folder nested inside the input
            # folder) cannot be parsed as tables, so they are skipped.
            continue

        # sep=r"\s+" is the documented replacement for the deprecated
        # delim_whitespace=True option.
        data = pd.read_csv(smi_path, header=None, sep=r"\s+")

        # One pass over the table: each group holds the rows of one tranche.
        # sort=False keeps tranches in order of first appearance, and rows
        # inside a group keep their original relative order.
        for tranche, molecules in data.groupby(2, sort=False):
            # Tranche files are grouped into folders named after the first
            # three characters of the tranche (e.g. "H21" for "H21P310").
            tranche_dir = os.path.join(output_dir, tranche[:3])
            os.makedirs(tranche_dir, exist_ok=True)

            # Column 1 is written before column 0; append mode lets several
            # input files contribute to the same tranche file.
            molecules[[1, 0]].to_csv(
                os.path.join(tranche_dir, f"{tranche}.txt"),
                index=False,
                header=False,
                sep=" ",
                mode="a",
            )


if __name__ == "__main__":
    app()