-
Notifications
You must be signed in to change notification settings - Fork 789
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #3802 from lviliani/master
Add dataset name checking script
- Loading branch information
Showing 2 changed files with 148 additions and 0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,119 @@ | ||
import re | ||
|
||
def validate_block(name, pattern, block_name):
    """Validate one dataset-name block against a regular expression.

    The whole block must match ``pattern``; a match that only covers a
    prefix of ``name`` is rejected.

    Args:
        name: Text of the block being checked.
        pattern: Regular expression describing the block. When
            ``block_name`` is ``"ME-PS"`` the pattern must define at least
            three groups: group 1 wrapping the whole block, group 2 the
            matrix-element (ME) part and group 3 the parton-shower (PS)
            part of an ``ME-PS`` combination.
        block_name: Label of the block. ``"ME-PS"`` enables the extra
            ME/PS consistency check; any other label is only used to
            build the feedback message.

    Returns:
        A ``(is_valid, message)`` tuple.
    """
    # fullmatch (rather than match) so trailing junk such as "TuneCP5xyz"
    # is not accepted just because a valid prefix is present.
    match = re.fullmatch(pattern, name)

    if block_name == "ME-PS":
        if not match:
            return False, "Invalid ME-PS format"

        # Group 1 spans the entire block, so the ME part of a combination
        # is group 2; groups 2 and 3 are both None for a standalone generator.
        me = match.group(2)
        ps = match.group(3)

        if ps and me == ps:
            return False, "ME and PS cannot be the same"

        return True, "Valid ME-PS block"

    if match:
        return True, f"{block_name} block is valid."
    return False, f"Invalid {block_name} block."
|
||
def validate_dataset_name(dataset_name):
    """Check a dataset name block by block.

    The name is split on underscores and the resulting blocks are consumed
    in this order: PROCESS, optional BINNING (``Bin-`` prefix), optional
    FILTER (``Fil-`` prefix), optional PARAMETERS (``Par-`` prefix), TUNE,
    BEAME and ME-PS. Each block is checked with ``validate_block``.

    Args:
        dataset_name: The dataset name string to check.

    Returns:
        A ``(is_valid, summary, feedback)`` tuple where ``summary`` is
        ``"Valid dataset name"`` or ``"Invalid dataset name"`` and
        ``feedback`` is a list with one message per inspected block.
    """
    # Define regex patterns for different blocks
    process_pattern = r".*"  # PROCESS is mandatory
    binning_pattern = r"Bin-[\w-]+"  # BINNING is optional
    filter_pattern = r"Fil-[\w-]+"  # FILTER is optional
    param_pattern = r"Par-[\w-]+"  # PARAMETERS is optional
    tune_pattern = r"TuneCP[1-5]"  # TUNE is mandatory (TuneCP1 to TuneCP5)
    beame_pattern = r"13p6TeV|\d+TeV|\d+GeV"  # BEAME is mandatory
    # Group 1 wraps the whole block, group 2 is the ME generator and
    # group 3 the PS generator of an ME-PS combination.
    me_ps_pattern = (
        r"(pythia6|pythia8|pythia8-evtgen|herwig6|herwigpp|herwig7|sherpa|"
        r"(madgraph|madgraphMLM|amcatnloFXFX|madgraph-madspin|madgraphMLM-madspin|"
        r"amcatnloFXFX-madspin|amcatnlo|amcatnlo-madspin|alpgen|mcatnlo|powheg|"
        r"powheg-madspin|powheg-JHUGenV\d*|powheg-minlo|powheg-minnlo|powheg-minlo-JHUGenV\d*|"
        r"powheg-minnlo-JHUGen\d*|JHUGen|hardcol|bcvegpy2)"
        r"-(pythia6|pythia8|herwig6|herwigpp|herwig7))"
    )

    blocks = dataset_name.split('_')
    feedback = []
    valid = True

    # str.split always yields at least one element, so a missing PROCESS
    # shows up as an empty first block rather than an empty list.
    if blocks[0]:
        ok, msg = validate_block(blocks[0], process_pattern, "PROCESS")
        feedback.append(msg)
        valid = valid and ok
    else:
        feedback.append("Missing PROCESS block.")
        valid = False

    current_index = 1

    # Optional blocks are recognised by their prefix and only consume a
    # position when present.
    optional_blocks = (
        ("Bin-", binning_pattern, "BINNING"),
        ("Fil-", filter_pattern, "FILTER"),
        ("Par-", param_pattern, "PARAMETERS"),
    )
    for prefix, pattern, label in optional_blocks:
        if len(blocks) > current_index and blocks[current_index].startswith(prefix):
            ok, msg = validate_block(blocks[current_index], pattern, label)
            feedback.append(msg)
            valid = valid and ok
            current_index += 1
        else:
            feedback.append(f"{label} block is missing or optional.")

    # Mandatory blocks are positional: whatever comes next is checked
    # against the expected pattern.
    mandatory_blocks = (
        (tune_pattern, "TUNE"),
        (beame_pattern, "BEAME"),
        (me_ps_pattern, "ME-PS"),
    )
    for pattern, label in mandatory_blocks:
        if len(blocks) > current_index:
            ok, msg = validate_block(blocks[current_index], pattern, label)
            feedback.append(msg)
            valid = valid and ok
            current_index += 1
        else:
            feedback.append(f"Missing {label} block.")
            valid = False

    # Anything left after ME-PS does not belong to the naming scheme.
    extra_blocks = blocks[current_index:]
    if extra_blocks:
        feedback.append(
            "Unexpected extra block(s) after ME-PS: " + "_".join(extra_blocks)
        )
        valid = False

    # Return feedback and validity status
    if valid:
        return True, "Valid dataset name", feedback
    return False, "Invalid dataset name", feedback
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,29 @@ | ||
from check_dataset_names import * | ||
|
||
# Sample dataset names fed to validate_dataset_name below; one entry is
# marked inline as not being a valid name.
test_cases = ["DYto2L-4Jets_Bin-MLL-4to10_TuneCP5_13p6TeV_madgraphMLM-pythia8",
              "DYto2L-4Jets_Bin-MLL-10to50_TuneCP5_13p6TeV_madgraphMLM-pythia8",
              "DYto2L-2Jets_Bin-MLL-4to10_TuneCP5_13p6TeV_amcatnloFXFX-pythia8",
              "DYto2L-2Jets_Bin-MLL-10to50_TuneCP5_13p6TeV_amcatnloFXFX-pythia8",
              "DYto2L-4Jets_Bin-MLL-50_TuneCP5_13p6TeV_madgraphMLM-pythia8",
              "DYto2L-2Jets_Bin-MLL-50_TuneCP5_13p6TeV_amcatnloFXFX-pythia8",
              "DYto2L-4Jets_Bin-0J-MLL-50_TuneCP5_13p6TeV_madgraphMLM-pythia8",
              "DYto2L-4Jets_Bin-1J-MLL-50_TuneCP5_13p6TeV_madgraphMLM-pythia8",
              "DYto2L-4Jets_Bin-2J-MLL-50_TuneCP5_13p6TeV_madgraphMLM-pythia8",
              "DYto2L-4Jets_Bin-3J-MLL-50_TuneCP5_13p6TeV_madgraphMLM-pythia8",
              "DYto2L-4Jets_Bin-4J-MLL-50_TuneCP5_13p6TeV_madgraphMLM-pythia8",
              "DYto2L-2Jets_Bin-0J-MLL-50_TuneCP5_13p6TeV_amcatnloFXFX-pythia8",
              "DYto2L-2Jets_Bin-1J-MLL-50_TuneCP5_13p6TeV_amcatnloFXFX-pythia8",
              "DYto2L-2Jets_2J-MLL-50_TuneCP5_13p6TeV_amcatnloFXFX-pythia8", #THIS IS NOT A VALID NAME
              "DYto2L-2Jets_Bin-2J-MLL-50_TuneCP5_13p6TeV_amcatnloFXFX-pythia8"
              ]

# Validate the test cases
for name in test_cases:
    # validate_dataset_name returns (is_valid, summary message, per-block feedback).
    valid, message, feedback = validate_dataset_name(name)
    print(f"Dataset: {name} -> {message}")
    # The per-block feedback is only printed for names that were rejected.
    if not valid:
        for item in feedback:
            print(f" - {item}")

    # NOTE(review): indentation was lost in this extract; the separator is
    # assumed to be printed once per dataset (inside the loop) - confirm.
    print("\n")
|