Merge pull request #30 from cp3-llbb/delaere-version2
Minor fixes to das_import.py and SAMADhi_dbAnalysis.py, plus a new script to check samples and clean the db.
delaere authored Aug 27, 2018
2 parents b465332 + 5d68d72 commit bb1a204
Showing 5 changed files with 211 additions and 5 deletions.
8 changes: 8 additions & 0 deletions python/das_import.py
@@ -70,11 +70,19 @@ def query_das(dataset):
         for key, value in d.items():
             metadata[key] = value
 
     # Set release in global tag
     metadata.update({
         u'release': unicode(release_results["data"][0]["release"][0]["name"][0]),
         u'globalTag': unicode(config_results["data"][0]["config"][0]["global_tag"])
     })
+
+    # Last chance for the global tag
+    for d in config_results["data"]:
+        if metadata[u'globalTag']==u'UNKNOWN':
+            metadata[u'globalTag']=unicode(d["config"][0]["global_tag"])
+    if metadata[u'globalTag']==u'UNKNOWN':
+        del metadata[u'globalTag']
 
     return metadata
 
 def import_cms_dataset(dataset, process=None, energy=None, xsection=1.0, comment="", prompt=False):
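To see what the new fallback does, here is a minimal sketch run against a hand-built stand-in for the DAS config_results payload (the dict and the global tag value below are made-up examples, not real DAS output):

# hypothetical DAS-like payload: the first config entry carries no usable tag
config_results = {"data": [
    {"config": [{"global_tag": "UNKNOWN"}]},
    {"config": [{"global_tag": "94X_mc2017_realistic_v14"}]},
]}
metadata = {u'globalTag': u'UNKNOWN'}
# scan the entries until a real tag shows up
for d in config_results["data"]:
    if metadata[u'globalTag'] == u'UNKNOWN':
        metadata[u'globalTag'] = unicode(d["config"][0]["global_tag"])
# if nothing better was found, drop the key rather than keep 'UNKNOWN'
if metadata[u'globalTag'] == u'UNKNOWN':
    del metadata[u'globalTag']
print metadata[u'globalTag']   # 94X_mc2017_realistic_v14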
29 changes: 24 additions & 5 deletions scripts/SAMADhi_dbAnalysis.py
@@ -10,6 +10,7 @@
 from datetime import date
 from collections import defaultdict
 from cp3_llbb.SAMADhi.SAMADhi import Analysis, Dataset, Sample, Result, DbStore
+from cp3_llbb.SAMADhi.SAMADhi import File as SFile
 from storm.info import get_cls_info
 from datetime import datetime
 from collections import defaultdict
@@ -342,13 +343,31 @@ def checkSamplePath(dbstore,opts):
     array = []
     for sample in result:
         # check that the path exists, and keep track of the sample if not the case.
-        if not os.path.exists(sample.path):
-            print "Sample #%s (created on %s by %s):"%(str(sample.sample_id),str(sample.creation_time),str(sample.author)),
-            print " missing path: %s" %sample.path
-            array.append(sample)
+        vpath = getSamplePath(sample,dbstore)
+        for path in vpath:
+            if not os.path.exists(path):
+                print "Sample #%s (created on %s by %s):"%(str(sample.sample_id),str(sample.creation_time),str(sample.author)),
+                print " missing path: %s" %path
+                print vpath
+                array.append(sample)
+                break
     if len(array)==0: print "None"
     return array
 
+def getSamplePath(sample,dbstore):
+    vpath=[]
+    # the path should be stored in sample.path
+    # if it is empty, look for files in that path
+    if sample.path=="":
+        regex = r".*SFN=(.*)"
+        files = dbstore.find(SFile, SFile.sample_id==sample.sample_id)
+        for file in files:
+            m = re.search(regex,str(file.pfn))
+            if m: vpath.append(os.path.dirname(m.group(1)))
+        vpath=list(set(vpath))
+        return vpath
+    else:
+        return [sample.path]
+
 def selectResults(dbstore,opts):
     # look for result records pointing to a ROOT file
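The SFN regex used in getSamplePath can be tried in isolation; the PFN below is a made-up example in the usual srm://...?SFN=/path form:

import re, os
pfn = "srm://se.example.org:8446/srm/managerv2?SFN=/storage/data/cms/store/user/jdoe/MySample/output_1.root"
m = re.search(r".*SFN=(.*)", pfn)
if m:
    print os.path.dirname(m.group(1))   # /storage/data/cms/store/user/jdoe/MySample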
@@ -466,7 +485,7 @@ def analyzeAnalysisStatistics(dbstore,opts):
     regex = r".*([A-Z]{3})-\d{2}-\d{3}"
     stats["physicsGroup"] = defaultdict(int)
     for analysis in analyses:
-        m = re.search(regex,analysis.cadiline)
+        m = re.search(regex,str(analysis.cadiline))
         physicsGroup = "NONE"
         if m:
             physicsGroup = m.group(1)
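The str() cast presumably guards against a cadiline that is None or a non-string storm value; the regex itself just pulls the physics group out of a CADI label, as in this made-up example:

import re
m = re.search(r".*([A-Z]{3})-\d{2}-\d{3}", "HIG-16-044")
print m.group(1) if m else "NONE"   # HIG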
178 changes: 178 additions & 0 deletions scripts/checkAndClean.py
@@ -0,0 +1,178 @@
#!/usr/bin/env python
import json
import os
import sys
from cp3_llbb.SAMADhi.SAMADhi import Analysis, Dataset, Sample, Result, DbStore
from cp3_llbb.SAMADhi.SAMADhi import File as SFile
from optparse import OptionParser, OptionGroup

class MyOptionParser:
    """
    Client option parser
    """
    def __init__(self):
        usage = "Usage: %prog [options]\n"
        self.parser = OptionParser(usage=usage)
        self.parser.add_option("-p","--path", action="store", type="string",
                               dest="path", default="./",
                               help="Path to the json files with db analysis results.")
        self.parser.add_option("-o","--output", action="store", type="string",
                               dest="output", default="-",
                               help="Name of the output file.")
        self.parser.add_option("-M","--cleanupMissing", action="store_true",
                               dest="cleanupMissing", default=False,
                               help="Clean samples with missing path from the database.")
        self.parser.add_option("-U","--cleanupUnreachable", action="store_true",
                               dest="cleanupUnreachable", default=False,
                               help="Clean samples with unreachable path from the database.")
        self.parser.add_option("-D","--cleanupDatasets", action="store_true",
                               dest="cleanupDatasets", default=False,
                               help="Clean orphan datasets from the database.")
        self.parser.add_option("-w","--whitelist", action="store", type="string",
                               dest="whitelist", default=None,
                               help="JSON file with sample whitelists per analysis.")
        self.parser.add_option("-d","--dry-run", action="store_true",
                               dest="dryrun", default=False,
                               help="Dry run: do not write to file and/or touch the database.")

    def get_opt(self):
        """
        Returns the parsed list of options
        """
        opts, args = self.parser.parse_args()
        if opts.path is not None:
            opts.path = os.path.abspath(os.path.expandvars(os.path.expanduser(opts.path)))
        if opts.output == "-":
            opts.output = sys.__stdout__
        else:
            filepath = os.path.dirname(os.path.realpath(os.path.expanduser(opts.output)))
            if not os.access(filepath,os.W_OK):
                self.parser.error("Cannot write to %s"%filepath)
            if os.path.isfile(opts.output):
                self.parser.error("File already exists: %s"%opts.output)
            if not opts.dryrun:
                try:
                    opts.output = open(opts.output,"w")
                except IOError:
                    self.parser.error("Cannot write to %s"%opts.output)
            else:
                opts.output = sys.__stdout__
        # only try to open the whitelist if one was given, so that the
        # script still runs without -w
        if opts.whitelist is not None:
            try:
                opts.whitelist = open(opts.whitelist)
            except IOError:
                self.parser.error("Cannot open whitelist.")
        return opts

class StoreCleaner():
    """
    Handle to the db store, with basic facilities to cleanup entries
    """

    def __init__(self):
        self.dbstore = DbStore()

    def deleteSample(self,sample_id):
        store = self.dbstore
        # first remove the files associated with the sample
        files = store.find(SFile,SFile.sample_id==sample_id)
        for sampleFile in files:
            store.remove(sampleFile)
        # then remove the sample
        sample = store.find(Sample,Sample.sample_id==sample_id).one()
        print("deleting sample %d"%sample_id)
        store.remove(sample)

    def deleteDataset(self,dataset_id):
        store = self.dbstore
        # simply delete the dataset
        dataset = store.find(Dataset,Dataset.dataset_id==dataset_id).one()
        print("deleting dataset %d"%dataset_id)
        store.remove(dataset)

    def commit(self):
        self.dbstore.commit()

    def rollback(self):
        self.dbstore.rollback()
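
# usage sketch (illustrative only; sample id 42 is made up): deletions are
# staged with deleteSample()/deleteDataset() and only reach the database at
# commit(); rollback() discards everything, which is how a dry run stays safe
#   cleaner = StoreCleaner()
#   cleaner.deleteSample(42)
#   cleaner.commit()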


# Script to check samples for deletion

def main():
    """Main function"""
    # get the options
    optmgr = MyOptionParser()
    opts = optmgr.get_opt()

    # set stdout
    sys.stdout = opts.output

    # whitelist with samples that we should not touch ever
    if opts.whitelist is not None:
        whitelist = json.load(opts.whitelist)
    else:
        whitelist = {}

    # utility class to clean the db
    myCleaner = StoreCleaner()

    # open the sample analysis report and classify bad samples
    samplesAnalysisReport = os.path.join(opts.path, "SamplesAnalysisReport.json")
    with open(samplesAnalysisReport) as jfile:
        data = json.load(jfile)
    samples = data["MissingDirSamples"]
    investigate = []
    delete = []
    empty = []
    empty_delete = []
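    # classify each problematic sample: whitelisted samples are only reported
    # (empty / investigate), the others are scheduled for deletion
    # (empty_delete / delete); in both groups, samples with an empty path and
    # samples with an unreachable path are kept separate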
    for sample in samples:
        whitelisted = False
        for v in whitelist.values():
            for label in v:
                if label in sample["name"]:
                    whitelisted = True
        if whitelisted:
            if sample["path"]=="":
                empty.append(sample)
            else:
                investigate.append(sample)
        else:
            if sample["path"]=="":
                empty_delete.append(sample)
            else:
                delete.append(sample)
    print("\n\nWhitelisted sample with missing path. Investigate:")
    for sample in empty:
        print(sample["name"])
    print("\n\nWhitelisted sample with unreachable path. Investigate:")
    for sample in investigate:
        print(sample["name"])
    print("\n\nSamples to be deleted because of missing path:")
    for sample in empty_delete:
        print(sample["name"])
        if opts.cleanupMissing : myCleaner.deleteSample(sample["sample_id"])
    print("\n\nSamples to be deleted because of unreachable path:")
    for sample in delete:
        print(sample["name"])
        if opts.cleanupUnreachable : myCleaner.deleteSample(sample["sample_id"])

    # now clean orphan datasets
    datasetsAnalysisReport = os.path.join(opts.path, "DatasetsAnalysisReport.json")
    with open(datasetsAnalysisReport) as jfile:
        data = json.load(jfile)
    datasets = data["Orphans"]
    for dataset in datasets:
        if opts.cleanupDatasets : myCleaner.deleteDataset(dataset["dataset_id"])

    # and commit
    if not opts.dryrun:
        myCleaner.commit()
    else:
        myCleaner.rollback()

#
# main
#
if __name__ == '__main__':
    main()
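For reference, a minimal sketch of the JSON file that --whitelist expects; the analysis names and sample labels are made-up placeholders, not entries from the real database:

import json

# hypothetical whitelist: one entry per analysis, each listing substrings of
# sample names that must never be deleted
whitelist = json.loads("""
{
    "SomeAnalysis": ["TTTo2L2Nu", "DYJetsToLL"],
    "AnotherAnalysis": ["SingleMuon_Run2016"]
}
""")

# a sample is protected as soon as any label is a substring of its name,
# mirroring the `label in sample["name"]` test in checkAndClean.py
name = "TTTo2L2Nu_13TeV_powheg_sample"
print any(label in name for label in sum(whitelist.values(), []))   # True

A typical dry run over the reports produced by SAMADhi_dbAnalysis.py would then look like: checkAndClean.py -p ./reports -w whitelist.json -d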

scripts/das_import.py: file mode changed 100644 → 100755 (no content changes).
1 change: 1 addition & 0 deletions setup_standalone.sh
@@ -92,6 +92,7 @@ checkAndPrepend "PYTHONPATH" "${pysitedir}"
 checkAndPrepend "PYTHONPATH" "${installpath}/lib64/python${pymajmin}/site-packages"
 ( ${python} -c "import MySQLdb" > /dev/null 2> /dev/null ) || ${python} -m pip install --prefix="${installpath}" MySQL-python
 ( ${python} -c "import storm" > /dev/null 2> /dev/null ) || ${python} -m pip install --prefix="${installpath}" storm
+( ${python} -c "import ROOT" > /dev/null 2> /dev/null ) || echo "Warning: PyROOT not found; ROOT is not pip-installable, install it separately." >&2
 
 ## Install SAMADhi
 if [[ ! -d "${pysitedir}/cp3_llbb" ]]; then
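The two pip lines follow a probe-and-install idiom: try the import, and only fall back to pip when it fails (the added ROOT line can only warn, since PyROOT is not available from pip in this setup). The same idiom, sketched in Python with example module/package pairs and a placeholder install prefix:

import importlib, subprocess, sys

# try each module; if the import fails, install the matching pip package
for module, package in [("MySQLdb", "MySQL-python"), ("storm", "storm")]:
    try:
        importlib.import_module(module)
    except ImportError:
        subprocess.check_call([sys.executable, "-m", "pip", "install",
                               "--prefix", "/path/to/installdir", package])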
