Source code for lightwin.scripts.combine_solutions

#!/usr/bin/env python3
"""Take the best compensation settings among several runs.

Basic usage:
``./combine_solutions.py -d /path/to/sim1/ /path/to/sim2/ /path/to/sim3``

This is useful when different strategies are best on different fault scenarios.

The compensation settings are sorted against a column in the
``evaluations.csv`` files.

Reminder on the directory structure and conventions:

simulation_1            <- project folder
    | 000000_ref        <- simulation folder (holds a fault scenario)
        | 0_Envelope1D
        | 1_TraceWin
    | 000001
        | 0_Envelope1D
        | 1_TraceWin
    | ...
    | evaluations.csv
simulation_2
    | 000000_ref
    | 000001
    | ...


"""

import argparse
import os
import shutil
from collections.abc import Collection, Sequence
from pathlib import Path

import pandas as pd


[docs] def combine_bests( paths: Sequence[Path | str], criterion_to_minimize: str = "Lost power over whole linac in W.", out_folder: Path | str = "", copy: bool = False, ) -> None: """Compare several solutions, and concatenate the best one. To determine the best solution for every fault scenario, we open each simulation's ``evaluations.csv``, look at the value of the column ``criterion_to_minimize`` and keep the simulation with the lowest. Parameters ---------- paths : Project folders (where ``evaluations.csv`` and every simulation is). criterion_to_minimize : The ``evaluations.csv`` column against which simulations are compared. The default is ``"Lost power over whole linac in W."`` (we keep simulations with the lowest lost power). out_folder : Where every best simulation folder will be gathered. If not provided, we create a ``combined/`` folder in the last common ancestor of all provided paths. copy : To create hard-copies of the original simulation folders instead of creating a symlink. """ paths = [Path(p).resolve() if isinstance(p, str) else p for p in paths] best_simulation_folders, combined = _select_best_simulations( paths, criterion_to_minimize=criterion_to_minimize ) if not out_folder: out_folder = _infer_an_output_folder(paths) if isinstance(out_folder, str): out_folder = Path(out_folder) if not out_folder.is_dir(): os.mkdir(out_folder) combined.to_csv(out_folder / "evaluations.csv", sep=",") _gather_best_simulations_in_same_place( best_simulation_folders, out_folder, create_symlinks_instead_of_hard_copies=not copy, )
[docs] def _infer_an_output_folder( paths: Sequence[Path], output_folder_name: str = "combined" ) -> Path: """Return output folder in the last common parent of ``paths``.""" assert len(paths) > 0 common_parent = paths[0].parts for path in paths[1:]: common_parent = [ part for part, other_part in zip(common_parent, Path(path).parts) if part == other_part ] output_path = Path(*common_parent) / output_folder_name if not output_path.exists(): output_path.mkdir() return output_path
[docs] def _select_best_simulations( paths: Sequence[Path], criterion_to_minimize: str, ) -> tuple[pd.Series, pd.DataFrame]: """Give the name of the best solution according to ``criterion_to_minimize`` Parameters ---------- paths : Path to project folders to be compared. criterion_to_minimize : The quantity that we want to minimize. It must be a column name in ``evaluations.csv``. Returns ------- best_simulation_folders : For each fault scenario, holds the path to the best simulation folder. combined : A ``evaluations.csv`` where each row holds the values for the best simulation. You must ensure that all ``evaluations.csv`` have the same columns. """ all_df = [ _load_evaluation( simulation_folder, evaluation_namecol=(criterion_to_minimize,), new_name=(str(simulation_folder),), ) for simulation_folder in paths ] criterion_values = pd.concat(all_df, axis=1) best_simulations = criterion_values.idxmin(axis=1) best_simulations.name = "best simulation project folder" n_simulations = len(all_df[0]) folder_names = _reconstruct_folder_names(n_simulations) criterion_values = pd.concat((best_simulations, folder_names), axis=1) best_simulation_folders = [ Path(best_sim) / folder_name for best_sim, folder_name in zip(best_simulations, folder_names) ] all_df = { simulation_folder: _load_evaluation(simulation_folder) for simulation_folder in paths } combined = _concat_evaluations_files(all_df, best_simulations) return best_simulation_folders, combined
[docs] def _reconstruct_folder_names(n_simulations: int) -> pd.Series: """Reconstruct the name of the simulation folders.""" folders = [f"{i:06d}" for i in range(n_simulations)] folders[0] += "_ref" return pd.Series(folders, name="simulation_folder")
[docs] def _concat_evaluations_files( evaluations: dict[str, pd.DataFrame], best_solutions: Sequence[Path], ) -> pd.DataFrame: """Concatenate the evaluations, taking only the best. Parameters ---------- evaluations : Keys are user-defined names for every simulation. Values are corresponding ``evaluations.csv`` files; their columns must be the same. They must have the same indexes. best_solutions : For every calculation, defines the ``evaluation`` we want to keep. The lenght must be the same as the length of every object in ``evaluations``. Contained strings must all be keys of ``evaluations``. Returns ------- Same columns as ``evaluations``. Every row contains the ``evaluations.csv`` row of the best solution. """ best_evaluation_dfs = [] for i, solution in enumerate(best_solutions): solution = Path(solution) if solution not in evaluations: raise ValueError( f"The solution '{solution}' is not found in evaluations." ) best_line = evaluations[solution].iloc[i] best_evaluation_dfs.append(best_line) combined = pd.concat(best_evaluation_dfs, axis=1).T combined["best solutions"] = best_solutions return combined
[docs] def _gather_best_simulations_in_same_place( best_simulation_folders: Collection[Path], out_folder: Path, create_symlinks_instead_of_hard_copies: bool, dirs_exist_ok: bool = True, ) -> None: """Concatenate the best simulation folders in a single place.""" for folder in best_simulation_folders: dst = out_folder / folder.name _copy_or_create_symlink( folder, dst, create_symlinks_instead_of_hard_copies=create_symlinks_instead_of_hard_copies, dirs_exist_ok=dirs_exist_ok, )
[docs] def _load_evaluation( evaluation_folder: Path, evaluation_namecol: Sequence[str] | None = None, new_name: Sequence[str] | None = None, ) -> pd.DataFrame: """Load the file and rename column header. Parameters ---------- evaluation_folder : Folder where the ``evaluations.csv`` file is. evaluation_namecol : Name of the column in the ``evaluations.csv`` for the sorting; if not provided, we keep all the columns. new_name : If provided, the loaded columns will be renamed with this. The length of ``new_name`` must match ``evaluation_namecol``. Returns ------- Holds all the values of ``evaluation_namecol`` in ``evaluation_path``. """ df = pd.read_csv( evaluation_folder / "evaluations.csv", # usecols=evaluation_namecol # type: ignore ) df.columns = df.columns.str.trip() df = df[[evaluation_namecol]] if new_name is not None: assert evaluation_namecol is not None assert len(new_name) == len(evaluation_namecol) new_names = { original_col: new for original_col, new in zip( evaluation_namecol, new_name, strict=True ) } df.rename(columns=new_names, inplace=True) return df
[docs] def main(): parser = argparse.ArgumentParser("combine_solutions") parser.add_argument( "-m", "--minimize", help="Column we want to minimize in the different evaluations.csv. If " + " not provided, we minimize 'Lost power over whole linac in W.'.", type=str, required=False, default="Lost power over whole linac in W.", ) parser.add_argument( "-o", "--outfolder", help="Where the resulting evaluations.csv and corresponding simulation" + " folders should be stored. If not provided, we create a 'combined' " + "folder in the closest ancestor of all provided simulation folders.", type=str, required=False, default="", ) parser.add_argument( "-c", "--copy", help="Flag to copy the whole simulation folders instead of symlinking " + "them.", required=False, action="store_true", ) parser.add_argument( "-d", "--dirs", help="The project folders to compare. They are the folders " + "holding the `evaluations.csv` as well as the `000000_ref`, `000001`" + ", etc folders.", required=True, nargs="+", ) args = parser.parse_args() combine_bests( paths=args.dirs, criterion_to_minimize=args.minimize, out_folder=args.outfolder, copy=args.copy, )
if __name__ == "__main__": main()