"""Command line program to reformat enumeration files into tranches.
Nicholas Hadler (nhadler@berkeley.edu) 2023
"""

import os

import pandas as pd  # type: ignore
import typer
from tqdm import tqdm  # type: ignore

# Typer application object; the CLI command below is registered on it.
app = typer.Typer()


@app.command(help="Reformats enumeration smi files to tranched text files.")
def make_tranches(input_dir: str, output_dir: str) -> None:
    """Reformats enumeration smi files to tranched text files.

    More specifically, this splits molecules in a library based on their
    heavy atom count and their log P so they can be registered with the Zinc
    database.

    Every regular file in ``input_dir`` is read as a headerless,
    whitespace-delimited table whose third column (index 2) holds the
    tranche label. Rows are grouped by that label, and columns 1 and 0 (in
    that order) are appended, space-separated, to
    ``<output_dir>/<first three characters of label>/<label>.txt``.

    Output files are opened in append mode, so running the command twice on
    the same input writes every row twice.

    Example:
        > python make_tranches.py <input_dir> <output_dir>

        Molecule has a tranche of H21P310. It'll be appended to a text file
        "H21P310.txt" inside a "H21" folder.

    Args:
        input_dir (str): Directory containing library to tranche.
        output_dir (str): Directory to output tranched files.
    """
    for smi in tqdm(os.listdir(input_dir)):
        # Join with the OS separator so both directory arguments work with
        # or without a trailing slash.
        smi_path = os.path.join(input_dir, smi)

        # os.listdir also returns sub-directories, which cannot be parsed.
        if not os.path.isfile(smi_path):
            continue

        # sep=r"\s+" is the supported spelling of the deprecated
        # delim_whitespace=True.
        data = pd.read_csv(smi_path, header=None, sep=r"\s+")

        # One pass over the table instead of one boolean-mask scan per
        # tranche. sort=False keeps tranches in order of first appearance.
        # NOTE: groupby skips rows whose tranche label is missing (NaN).
        for tranche, molecules in data.groupby(2, sort=False):
            tranche_dir = os.path.join(output_dir, tranche[:3])
            os.makedirs(tranche_dir, exist_ok=True)

            molecules[[1, 0]].to_csv(
                os.path.join(tranche_dir, f"{tranche}.txt"),
                index=False,
                header=False,
                sep=" ",
                mode="a",
            )


# Invoke the Typer application only when this file is executed as a script.
if __name__ == "__main__":
    app()