"""Preprocess MLIP training data."""from__future__importannotationsfromcollections.abcimportSequencefromtypingimportAnyfrommace.cli.preprocess_dataimportrunfrommace.toolsimportbuild_preprocess_arg_parserasmace_parserimportyamlfromjanus_core.helpers.janus_typesimportPathLikefromjanus_core.helpers.logimportconfig_logger,config_trackerfromjanus_core.helpers.utilsimportcheck_files_exist,none_to_dict
def preprocess(
    mlip_config: PathLike,
    req_file_keys: Sequence[PathLike] = ("train_file", "test_file", "valid_file"),
    attach_logger: bool = False,
    log_kwargs: dict[str, Any] | None = None,
    track_carbon: bool = True,
    tracker_kwargs: dict[str, Any] | None = None,
) -> None:
    """
    Convert training data to hdf5 by passing a configuration file to the MLIP's CLI.

    Currently only supports MACE models, but this can be extended by replacing the
    argument parsing.

    Parameters
    ----------
    mlip_config : PathLike
        Configuration file to pass to MLIP.
    req_file_keys : Sequence[PathLike]
        List of files that must exist if defined in the configuration file.
        Default is ("train_file", "test_file", "valid_file").
    attach_logger : bool
        Whether to attach a logger. Default is False.
    log_kwargs : dict[str, Any] | None
        Keyword arguments to pass to `config_logger`. Default is {}.
    track_carbon : bool
        Whether to track carbon emissions of calculation. Default is True.
    tracker_kwargs : dict[str, Any] | None
        Keyword arguments to pass to `config_tracker`. Default is {}.

    Notes
    -----
    If a tracker is configured, it is stopped even when preprocessing raises, and
    the exception is then propagated unchanged to the caller.
    """
    log_kwargs, tracker_kwargs = none_to_dict(log_kwargs, tracker_kwargs)

    # Validate inputs
    with open(mlip_config, encoding="utf8") as file:
        options = yaml.safe_load(file)
    check_files_exist(options, req_file_keys)

    # Configure logging
    if attach_logger:
        log_kwargs.setdefault("filename", "preprocess-log.yml")
    log_kwargs.setdefault("name", __name__)
    logger = config_logger(**log_kwargs)
    tracker = config_tracker(logger, track_carbon, **tracker_kwargs)

    if logger and "foundation_model" in options:
        logger.info("Fine tuning model: %s", options["foundation_model"])

    # Parse options from config, as MACE cannot read config file yet
    args = []
    for key, value in options.items():
        if isinstance(value, bool):
            # Boolean options are bare flags: emitted only when enabled
            if value:
                args.append(f"--{key}")
        else:
            args.extend((f"--{key}", f"{value}"))

    mlip_args = mace_parser().parse_args(args)

    if logger:
        logger.info("Starting preprocessing")
    if tracker:
        tracker.start_task("Preprocessing")

    try:
        run(mlip_args)
        if logger:
            logger.info("Preprocessing complete")
    finally:
        # Stop the tracker on failure as well as success, so a task started
        # above is never left running if `run` raises
        if tracker:
            tracker.stop_task()
            tracker.stop()