CMS 2017 optimize #249

Open: wants to merge 58 commits into base: master

Commits (58)
4b68b06
Modified .py files for 2017 data curation scripts
danaalsharif Jul 23, 2024
c882a67
Modified lhe_generators.py to make a subdirectory for each 1k entries.
danaalsharif Jul 23, 2024
c0e6d3a
Create cms-2017-simulated-datsets
d0leh Jul 23, 2024
51a1115
Delete cms-2017-simulated-datasets/n
d0leh Jul 23, 2024
d62926f
Create n
d0leh Jul 23, 2024
6c655df
added cms-2017.txt and recid_info.py
d0leh Jul 23, 2024
834477c
Update and rename recid_info (1).py to recid_info.py
d0leh Jul 23, 2024
4d9594f
changes over codes and inputs for 2017 datasets.
Jul 24, 2024
cd1011d
Built 198 sample datasets of 2017. Ran all the scripts successfully a…
Jul 25, 2024
994930b
Delete cms-2017-simulated-datasets/cookies.txt
d0leh Jul 26, 2024
a3cfadb
Delete cms-2017-simulated-datasets/records.json
d0leh Jul 26, 2024
ca66115
Delete cms-2017-simulated-datasets/outputs directory
d0leh Jul 26, 2024
953f961
Delete cms-2017-simulated-datasets/lhe_generators directory
d0leh Jul 26, 2024
0b1124a
Delete cms-2017-simulated-datasets/inputs/config-store directory
d0leh Jul 26, 2024
da57ff7
Delete cms-2017-simulated-datasets/inputs/mcm-store directory
d0leh Jul 26, 2024
e016487
Delete cms-2017-simulated-datasets/inputs/das-json-store directory
d0leh Jul 26, 2024
0f7a053
Update README.md
d0leh Jul 26, 2024
633f307
Update README.md
d0leh Jul 26, 2024
e0e5217
Update README.md
danaalsharif Jul 26, 2024
53f1ba8
Created create_parent_dicts.py to make two dictionaries of parents an…
Aug 1, 2024
bc3fe7e
modified some scripts
d0leh Aug 1, 2024
98c4677
Update README.md
danaalsharif Aug 1, 2024
397e451
Integrated parents_dict into dataset_records.py
danaalsharif Aug 1, 2024
c81df53
Modified dataset_records.py
danaalsharif Aug 2, 2024
064a85c
Fixed bug in dataset_records.py
danaalsharif Aug 2, 2024
6ff0b9a
Create parent_dicts.py
danaalsharif Aug 2, 2024
5a5ce4a
Delete cms-2017-simulated-datasets/inputs/test-cms-2017
danaalsharif Aug 2, 2024
8fa5aa6
Delete cms-2017-simulated-datasets/inputs/cms-2017.txt
danaalsharif Aug 2, 2024
4a82d94
Add files via upload
danaalsharif Aug 2, 2024
548a3c6
Update README.md
d0leh Aug 2, 2024
a7a9bb0
Update README.md
danaalsharif Aug 2, 2024
00817f8
Modified mcm_store.py to create mcm cache for also the mini datasets …
d0leh Aug 2, 2024
273a839
Update interface.py
danaalsharif Aug 5, 2024
8313349
Modified interface.py to warn the user if number of threads is not 20…
d0leh Aug 6, 2024
4aed9af
Modified .gitignore
danaalsharif Aug 6, 2024
ba77ba0
Fixed input file name in lhe_generators.py
danaalsharif Aug 6, 2024
772b87b
Update README.md
d0leh Aug 6, 2024
2d16530
Update interface.py
d0leh Aug 6, 2024
d304ffb
Update README.md
d0leh Aug 6, 2024
23efd83
Update README.md
d0leh Aug 6, 2024
4ceb86c
Update README.md
d0leh Aug 6, 2024
5d1cdc7
Update config_store.py
danaalsharif Aug 6, 2024
4a01f5c
Modified config_store.py
danaalsharif Aug 6, 2024
464fce4
Modified create_parent_dicts.py to print a warning in case of /AODSIM…
d0leh Aug 6, 2024
a386d19
Update interface.py
d0leh Aug 6, 2024
47790c5
Modified mcm_store.py
danaalsharif Aug 6, 2024
e0c6f13
Update dataset_records.py
d0leh Aug 6, 2024
951ebb5
Removed unused functions in .py scripts
danaalsharif Aug 6, 2024
4f813b8
Removed comments
danaalsharif Aug 6, 2024
8080e46
Removed comments in lhe_generators.py
d0leh Aug 6, 2024
9fc3730
Modified lhe_generators.py to print another warning for tar file errors.
d0leh Aug 6, 2024
2ab1742
Update README.md
danaalsharif Aug 6, 2024
423f095
Removed unused variable
danaalsharif Aug 6, 2024
4e3e4e2
Delete cms-2017-simulated-datasets/output.txt
d0leh Aug 6, 2024
7f3dab2
Modified dataset_records.py to run get_all_generator_text only once f…
d0leh Aug 6, 2024
952245e
Added 2017 pileup file
danaalsharif Aug 6, 2024
8b3f3e2
Modified dataset_records.py to run get_all_generator_text only once f…
d0leh Aug 6, 2024
dd9ff7a
Added pileup file for 2017
danaalsharif Aug 6, 2024
10 changes: 10 additions & 0 deletions .gitignore
@@ -82,3 +82,13 @@ jade-2023-raw-datasets/outputs/*.json
opera-2017-multiplicity-studies/outputs/opera-events.json
opera-2019-electron-neutrinos/outputs/opera-events.json
opera-2019-neutrino-induced-charm/outputs/opera-events.json
cms-2017-simulated-datasets/cookies.txt
cms-2017-simulated-datasets/records.json
cms-2017-simulated-datasets/outputs
cms-2017-simulated-datasets/lhe_generators
cms-2017-simulated-datasets/inputs/config-store
cms-2017-simulated-datasets/inputs/mcm-store
cms-2017-simulated-datasets/inputs/das-json-store
cms-2017-simulated-datasets/inputs/parent_dicts.py
cms-2017-simulated-datasets/code/tar_check

14 changes: 9 additions & 5 deletions cms-2016-simulated-datasets/code/das_json_store.py
@@ -1,3 +1,4 @@
#das json
import json
import os
import subprocess
@@ -40,13 +41,13 @@ def get_das_store_json(dataset, query='dataset', das_dir=''):

def mydasgoclient(dataset, query, out_dir, out_file):
"Interface to dasgoclient"

out = out_dir + '/' + query + '/' + out_file
if os.path.exists(out) and os.stat(out).st_size != 0:
print('==> {:<9} {}'.format(query, dataset) +
'\n==> File already exist, skipping...\n')
return

print('\t{:<9} {}'.format(query, dataset))

cmd = 'dasgoclient -query "'
@@ -81,7 +82,8 @@ def create(dataset, das_dir):
def main(das_dir,
eos_dir,
datasets,
ignore_eos_store):
ignore_eos_store,
threads):
"Do the job."

# create dirs for dataset and release
@@ -98,13 +100,15 @@ def main(das_dir,
eos_datasets = check_datasets_in_eos_dir(datasets, eos_dir)

total = len(eos_datasets)
if threads > total:  # cap threads at the number of datasets
    threads = total
i = 1
for dataset in eos_datasets:
print("dasgoclienting ({}/{})".format(i, total), dataset)
t = threading.Thread(target=create, args=(dataset, das_dir))
t.start()
while threading.activeCount() >= 100 :
sleep(0.5) # run 100 dasgoclient commands in parallel
while threading.activeCount() >= threads :
sleep(0.5) # run at most `threads` dasgoclient commands in parallel
i += 1


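Reviewer note: the hunk above replaces the hard-coded limit of 100 concurrent dasgoclient calls with the new `threads` parameter. A minimal standalone sketch of the throttle pattern, with a dummy work function standing in for the real dasgoclient call:

import threading
from time import sleep

def work(dataset):
    sleep(0.1)  # placeholder for the real per-dataset dasgoclient query

def run_throttled(datasets, threads=20):
    for dataset in datasets:
        threading.Thread(target=work, args=(dataset,)).start()
        # block until a slot frees up, so at most `threads` threads stay alive
        while threading.active_count() >= threads:
            sleep(0.5)

run_throttled(['/Dummy/Dataset-%d/MINIAODSIM' % n for n in range(50)], threads=10)

Note that `threading.active_count()` includes the main thread, so the effective worker ceiling is one below `threads`; `concurrent.futures.ThreadPoolExecutor(max_workers=threads)` would express the same bound more directly.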
31 changes: 18 additions & 13 deletions cms-2016-simulated-datasets/code/dataset_records.py
@@ -1,3 +1,4 @@
#dataset records
#!/usr/bin/env python


@@ -115,7 +116,7 @@ def get_dataset(dataset_full_name):

def get_dataset_version(dataset_full_name):
"Return dataset version from dataset full name."
return re.search(r'^.*RunIISummer20UL16.*?-(.*)/(MINI|NANO)AODSIM$', dataset_full_name).groups()[0]
return re.search(r'^.*RunIISummer20UL17.*?-(.*)/(MINI|NANO)AODSIM$', dataset_full_name).groups()[0]


def get_dataset_index_files(dataset_full_name, eos_dir):
@@ -314,8 +315,9 @@ def populate_mininanorelation_cache(dataset_full_names, mcm_dir):

def get_dataset_semantics_doc(dataset_name, sample_file_path, recid):
"""Produce the dataset semantics files and return their data-curation paths for the given dataset."""
output_dir = f"outputs/docs/NanoAODSIM/{recid}"
eos_dir = f"/eos/opendata/cms/dataset-semantics/NanoAODSIM/{recid}"
recid_rounded = int(recid)//1000 * 1000
output_dir = f"outputs/docs/NanoAODSIM/{recid_rounded}/{recid}"
eos_dir = f"/eos/opendata/cms/dataset-semantics/NanoAODSIM/{recid_rounded}/{recid}"
isExist = os.path.exists(output_dir)
if not isExist:
os.makedirs(output_dir)
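Aside: the new `recid_rounded` bucketing shards the semantics docs into per-thousand subdirectories instead of one flat directory. A quick worked example with hypothetical recid values:

for recid in (999, 1000, 30564):
    recid_rounded = int(recid) // 1000 * 1000
    print(f"outputs/docs/NanoAODSIM/{recid_rounded}/{recid}")
# 999   -> outputs/docs/NanoAODSIM/0/999
# 1000  -> outputs/docs/NanoAODSIM/1000/1000
# 30564 -> outputs/docs/NanoAODSIM/30000/30564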
@@ -392,7 +394,7 @@ def create_record(dataset_full_name, doi_info, recid_info, eos_dir, das_dir, mcm

rec_files = get_dataset_index_files(dataset_full_name, eos_dir)
if rec_files:
rec['files'] = []
rec['files'] = []
for index_type in ['.json', '.txt']:
index_files = [f for f in rec_files if f[0].endswith(index_type)]
for file_number, (file_uri, file_size, file_checksum) in enumerate(index_files):
@@ -545,7 +547,7 @@ def create(dataset, doi_info, recid_info, eos_dir, das_dir, mcm_dir, conffiles_d



def create_records(dataset_full_names, doi_file, recid_file, eos_dir, das_dir, mcm_dir, conffiles_dir, records_dir):
def create_records(dataset_full_names, doi_file, recid_file, eos_dir, das_dir, mcm_dir, conffiles_dir, records_dir, threads):
"""Create records."""

recid_info = {}
@@ -555,13 +557,16 @@ def create_records(dataset_full_names, doi_file, recid_file, eos_dir, das_dir, m

doi_info = populate_doiinfo(doi_file)

if threads > len(dataset_full_names): # cap threads at the number of datasets
threads = len(dataset_full_names)
records = []
for dataset_full_name in dataset_full_names:

#2016: comment out threading for debugging
t= threading.Thread(target=create, args=(dataset_full_name, doi_info, recid_info, eos_dir, das_dir, mcm_dir, conffiles_dir, records_dir))
t.start()
while threading.activeCount() >= 20 :
sleep(0.5) # run 20 parallel
while threading.activeCount() >= threads :
sleep(0.5) # run at most `threads` in parallel

#records.append(create_record(dataset_full_name, doi_info, recid_info, eos_dir, das_dir, mcm_dir, conffiles_dir))
#return records
@@ -580,7 +585,7 @@ def print_records(records):
print(']')


def main(datasets, eos_dir, das_dir, mcm_dir, conffiles_dir, doi_file, recid_file):
def main(datasets, eos_dir, das_dir, mcm_dir, conffiles_dir, doi_file, recid_file, threads):
"Do the job."

populate_containerimages_cache()
@@ -589,7 +594,7 @@ def main(datasets, eos_dir, das_dir, mcm_dir, conffiles_dir, doi_file, recid_fil
records_dir= "./outputs/records-" + dt.now().strftime("%Y-%m")
os.makedirs(records_dir, exist_ok=True)

create_records(datasets, doi_file, recid_file, eos_dir, das_dir, mcm_dir, conffiles_dir, records_dir)
create_records(datasets, doi_file, recid_file, eos_dir, das_dir, mcm_dir, conffiles_dir, records_dir, threads)

#records = create_records(datasets, doi_file, recid_file, eos_dir, das_dir, mcm_dir, conffiles_dir)
#json.dump(records, indent=2, sort_keys=True, ensure_ascii=True, fp=sys.stdout)
@@ -602,17 +607,17 @@ def get_step_generator_parameters(dataset, mcm_dir, recid, force_lhe=0):
if mcdb_id > 1:
print("Got mcdb > 1: " + str(mcdb_id))
configuration_files['title'] = 'Generator parameters'
configuration_files['url'] = "/eos/opendata/cms/lhe_generators/2016-sim/mcdb/{mcdb_id}_header.txt".format(mcdb_id=mcdb_id)
configuration_files['url'] = "/eos/opendata/cms/lhe_generators/2017-sim/mcdb/{mcdb_id}_header.txt".format(mcdb_id=mcdb_id)
return [configuration_files]
else:
dir='./lhe_generators/2016-sim/gridpacks/' + str(recid) + '/'
dir='./lhe_generators/2017-sim/gridpacks/' + str(recid) + '/'
files = []
files = [f for f in os.listdir(dir) if os.path.isfile(os.path.join(dir, f))]
confarr=[]
for f in files:
if f != 'LOG.txt':
configuration_files['title'] = 'Generator parameters: ' + f
configuration_files['url'] = '/eos/opendata/cms/lhe_generators/2016-sim/gridpacks/' + str(recid) + '/' + f
configuration_files['url'] = '/eos/opendata/cms/lhe_generators/2017-sim/gridpacks/' + str(recid) + '/' + f
confarr.append(configuration_files.copy())
dirs = [d for d in os.listdir(dir) if os.path.isdir(os.path.join(dir, d))]
subdir_contains = ['InputCards', '_JHUGen']
Expand All @@ -624,7 +629,7 @@ def get_step_generator_parameters(dataset, mcm_dir, recid, force_lhe=0):
for f in files:
if f != 'LOG.txt':
configuration_files['title'] = 'Generator parameters: ' + f
configuration_files['url'] = '/eos/opendata/cms/lhe_generators/2016-sim/gridpacks/' + str(recid) + '/' + d + '/' + f
configuration_files['url'] = '/eos/opendata/cms/lhe_generators/2017-sim/gridpacks/' + str(recid) + '/' + d + '/' + f
confarr.append(configuration_files.copy())
return confarr
else:
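Aside: the `.copy()` in `confarr.append(configuration_files.copy())` is load-bearing; appending the shared dict itself would make every list entry alias the last file's title and URL. A minimal illustration with hypothetical file names:

configuration_files = {}
confarr = []
for f in ('cards_a.txt', 'cards_b.txt'):
    configuration_files['title'] = 'Generator parameters: ' + f
    confarr.append(configuration_files.copy())  # snapshot, not a shared reference
print([c['title'] for c in confarr])
# ['Generator parameters: cards_a.txt', 'Generator parameters: cards_b.txt']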
31 changes: 27 additions & 4 deletions cms-2016-simulated-datasets/code/interface.py
@@ -1,3 +1,4 @@
# interface
#!/usr/bin/env python

"""
@@ -57,6 +58,12 @@
@click.option('--doi-file', default='./inputs/doi-sim.txt',
show_default=True, type=click.Path(),
help='File with DOI information')
@click.option('--threads', default=20, show_default=True,
help='Number of threads to use')
@click.option('--lhe-generators', default=False,
show_default=True, is_flag=True,
help='Create LHE generators.')

def main(dataset_list,
create_eos_indexes, eos_dir, ignore_eos_store,
create_das_json_store, das_dir,
@@ -65,7 +72,9 @@ def main(dataset_list,
print_categorisation, print_results,
create_records,
create_conffile_records,
recid_file, doi_file):
recid_file, doi_file, threads,
lhe_generators
):
"""
Interface for manipulation of dataset records for OpenData portal.

@@ -136,6 +145,16 @@

$ python ./code/interface.py --print-categorisation DATASET_LIST > categorisation.md
"""

if threads > 1000:
print("Thread number cannot exceed 1000. To modify this limit, change the code of interface.py.")
exit()

if threads > 100:
proceed = input("Thread number exceeds 100. Do you want to proceed? (y/n): ")
if proceed.lower() != 'y':
exit()

datasets = get_datasets_from_dir(dataset_list)

if create_eos_indexes:
@@ -155,11 +174,11 @@
print('Did you forget to "voms-proxy-init -voms cms -rfc"?')
else:
import das_json_store
das_json_store.main(das_dir, eos_dir, datasets, ignore_eos_store)
das_json_store.main(das_dir, eos_dir, datasets, ignore_eos_store, threads)

if create_mcm_store:
import mcm_store
mcm_store.create(datasets, mcm_dir, eos_dir, ignore_eos_store)
mcm_store.create(datasets, mcm_dir, eos_dir, threads, ignore_eos_store)

if get_conf_files:
# check if user has key and cert
@@ -182,12 +201,16 @@

if create_records:
import dataset_records
dataset_records.main(datasets, eos_dir, das_dir, mcm_dir, conf_dir, doi_file, recid_file)
dataset_records.main(datasets, eos_dir, das_dir, mcm_dir, conf_dir, doi_file, recid_file, threads)

if create_conffile_records:
import conffiles_records
conffiles_records.main(datasets, eos_dir, das_dir, mcm_dir, conf_dir)

if lhe_generators:
import lhe_generators
lhe_generators.main(threads)


if __name__ == '__main__':
main()
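For reference, hypothetical invocations exercising the new options (option names inferred from the click decorators and the main() signature; DATASET_LIST stands for the dataset list file, as in the module docstring examples):

$ python ./code/interface.py --create-das-json-store --threads 50 DATASET_LIST
$ python ./code/interface.py --lhe-generators DATASET_LIST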