sortDataLoggerFiles.py
#!/usr/bin/env python
#
# Documentation can be rendered in a Python interactive session with
# `import sortDataLoggerFiles; help(sortDataLoggerFiles)` and on the command
# line with `sortDataLoggerFiles --help`.
#
# It may require ROOT.
#
# Changes:
# 20210411 (petrillo@slac.fnal.gov) [1.0]
#   first public version
# 20210518 (petrillo@slac.fnal.gov) [1.1]
#   added options for duplicate events
# 20210602 (petrillo@slac.fnal.gov) [1.2]
#   added optional stream name to the file name pattern;
#   fixed a bug where the first logger option value would be ignored
# 20220222 (petrillo@slac.fnal.gov) [1.3]
#   added support for a new file name format, and for multiple formats
#

import sys, os
import re
import logging

__doc__ = """Sorts a list of data logger output files.

File paths are read from all the specified file lists in sequence, or from
standard input if no file list is specified.

If a line is encountered that does not match the typical file name pattern,
that line is ignored and a warning is printed.

Comments and empty lines at the beginning of the first file list are printed
at the top of the output as they are. All other comments and empty lines are
printed at the end of the output.

Note that it is possible to sort "in place" by specifying the same file list as
input and output.

Duplicate files are files with the same run, data logger cycle and data logger
number. By default, only the first of the duplicate files is written into the
output list, and only the number of duplicates is printed. Options allow
writing a detailed list of duplicate files on screen or to disk, or skipping
the duplicate check altogether.

"""
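# Typical invocation (file names here are illustrative, not from the source):
#   sortDataLoggerFiles.py filelist.txt -o sorted.txt
# reads paths from filelist.txt and writes the sorted list into sorted.txt;
# with no argument, paths are read from standard input instead.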

__author__ = 'Gianluca Petrillo (petrillo@slac.stanford.edu)'
__date__ = 'February 22, 2022'
__version__ = '1.3'


class CycleCompareClass:
  """Provides less() to compare numbers with a single offset cycle.

  For example, with offset 3 the order of [0:20] would be, from the lowest:
  [ 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 0, 1, 2, ]
  """
  def __init__(self, first): self.first = first
  def less(self, a, b): return (a < b) == ((a < self.first) == (b < self.first))
# class CycleCompareClass
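# The single comparison in less() works because a plain `a < b` is correct
# whenever a and b fall on the same side of `first`, and inverted when they
# straddle it. An illustrative check with the offset-3 example above:
#   cmp = CycleCompareClass(first=3)
#   assert cmp.less(3, 0)       # 3 opens the cycle, 0 is in the wrapped tail
#   assert not cmp.less(2, 19)  # 2, in the tail, follows every unwrapped value
#   assert cmp.less(0, 1)       # within the tail the natural order applies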


class FileNameParser:
  """Static object (namespace?) containing file name parsing utilities.

  All supported file name patterns are in the `Patterns` class variable.
  The static method `match()` tries all of them, in order.
  """
  Patterns = [
    {
      'Name': 'general',
      # pattern groups: <1> data logger, <2> stream, <3> stream name,
      #                 <4> run, <5> pass, <6> filler (timestamp)
      'Pattern': re.compile(r"data_dl(\d+)(_fstrm([^_]*))?_run(\d+)_(\d+)_(.*)\.root"),
      'Parameters': {
        'DataLogger': ( 1, int ),
        'StreamName': ( 3, str ),
        'RunNumber' : ( 4, int ),
        'PassCount' : ( 5, int ),
      },
    }, # general
    {
      'Name': 'multistream',
      # pattern groups: <1> stream name, <2> data logger,
      #                 <3> run, <4> pass, <5> filler (timestamp)
      'Pattern': re.compile(r"([^_]+)_data_dl(\d+)_run(\d+)_(\d+)_(.*)\.root"),
      'Parameters': {
        'DataLogger': ( 2, int ),
        'StreamName': ( 1, str ),
        'RunNumber' : ( 3, int ),
        'PassCount' : ( 4, int ),
      },
    }, # multistream
  ] # Patterns
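  # Illustrative examples of names matched by each pattern (the first is from
  # the example path quoted in the main program below; the second is made up):
  #   general:     data_dl2_run4989_1_20210219T015125_20210219T200434-decode.root
  #                -> DataLogger=2, StreamName=None, RunNumber=4989, PassCount=1
  #   multistream: bnb_data_dl3_run8441_12_20220201T123456.root
  #                -> DataLogger=3, StreamName='bnb', RunNumber=8441, PassCount=12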

  class ParsedNameClass:
    def __init__(self, name, fields):
      self.name = name
      self.fields = fields
    # __init__()
    # note: this hook was spelled `__nonzero__` (the Python 2 name); under
    # Python 3, required by this script, it must be `__bool__` for bool()
    # (and hence `is_file` below) to work
    def __bool__(self): return len(self.fields) > 0
    def get(self, *fieldNames):
      return tuple(self.fields.get(fieldName, None) for fieldName in fieldNames)
  # class ParsedNameClass

  def __init__(self): pass

  @staticmethod
  def match(name):
    # the first successful pattern is used
    for patternInfo in FileNameParser.Patterns:
      match = patternInfo['Pattern'].match(name)
      if match is None: continue
      d = dict(
        ( name, ( type_(value) if (value := match.group(index)) else None ) )
        for name, ( index, type_ ) in patternInfo['Parameters'].items()
        )
      d['Name'] = patternInfo['Name']
      return FileNameParser.ParsedNameClass(name, d)
    else: return FileNameParser.ParsedNameClass(name, {})
  # match()
# class FileNameParser
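# Minimal usage sketch for the parser above (illustrative file name):
#   parsed = FileNameParser.match("data_dl2_run4989_1_x.root")
#   if parsed: dataLogger, run = parsed.get('DataLogger', 'RunNumber')
# Note: the assignment expression (`value := ...`) in match() requires
# Python 3.8 or newer.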


class FileInfoClass:
  """This class collects information about an input file, including a sorting
  criterion.
  """

  XRootDprotocolHead = 'root://fndca1.fnal.gov:1094/'
  XRootDprotocolDir = 'pnfs/fnal.gov/usr'
  POSIXprotocolHead = '/'
  POSIXprotocolDir = 'pnfs'

  POSIXPattern = re.compile(
    POSIXprotocolHead.replace('.', r'\.')
    + POSIXprotocolDir.replace('.', r'\.')
    + r"/([^/]+)/(.*)")
  XRootDPattern = re.compile(
    XRootDprotocolHead.replace('.', r'\.')
    + XRootDprotocolDir.replace('.', r'\.')
    + r"/(.*)"
    )
  _DataLoggerSorter = CycleCompareClass(first=4)

  @staticmethod
  def getFirstDataLogger(): return FileInfoClass._DataLoggerSorter.first
  @staticmethod
  def setFirstDataLogger(index):
    FileInfoClass._DataLoggerSorter = CycleCompareClass(first=index)

  def __init__(self,
               line: "input file line (should include endline)",
               source: "an arbitrary identifier to track the origin of the line" = None,
               ):
    """Constructor: use and parse the specified input file line."""
    self.line = line
    self.source = source
    self.path = line.strip()
    self.protocolAndDir, self.name = os.path.split(self.path)
    parsedName = FileNameParser.match(self.name)
    self.is_file = bool(parsedName)
    if self.is_file:
      self.dataLogger, self.run, self.stream, self.pass_ \
        = parsedName.get('DataLogger', 'RunNumber', 'StreamName', 'PassCount')
  # __init__()

  def __lt__(self, other):
    """Comparison: run, then pass, then (offset cycled) data logger number."""
    if not self.is_file:
      raise RuntimeError \
        ("Sorting not supported for non-file objects ('%s')" % self.path)
    # if
    if self.run < other.run: return True
    if self.run > other.run: return False

    if self.pass_ < other.pass_: return True
    if self.pass_ > other.pass_: return False

    if self.dataLogger != other.dataLogger:
      return \
        FileInfoClass._DataLoggerSorter.less(self.dataLogger, other.dataLogger)

    assert (self.stream is None) == (other.stream is None)
    return False if self.stream is None else self.stream < other.stream

  # __lt__()

  def __str__(self):
    s = f"Run {self.run} cycle {self.pass_} data logger {self.dataLogger}"
    if self.stream: s += f" stream {self.stream}"
    return s

  def pathToXRootD(self) -> "stored file path in XRootD format":
    if not self.is_file:
      raise RuntimeError(
        "XRootD conversion not supported for non-file objects ('%s')" % self.path
        )
    # if not file
    match = FileInfoClass.POSIXPattern.match(self.path)
    return os.path.join(
      FileInfoClass.XRootDprotocolHead, FileInfoClass.XRootDprotocolDir,
      *match.group(1, 2)
      ) if match else self.path

  # pathToXRootD()

  def pathToPOSIX(self) -> "stored file path in POSIX (PNFS local) format":
    if not self.is_file:
      raise RuntimeError(
        "POSIX conversion not supported for non-file objects ('%s')" % self.path
        )
    # if not file
    match = FileInfoClass.XRootDPattern.match(self.path)
    return os.path.join(
      FileInfoClass.POSIXprotocolHead, FileInfoClass.POSIXprotocolDir,
      match.group(1)
      ) if match else self.path

  # pathToPOSIX()

# class FileInfoClass
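# Illustrative round trip between the two path flavors handled above
# (the path is made up, following the patterns defined in the class):
#   POSIX:  /pnfs/icarus/scratch/some/dir/data_dl2_run4989_1_x.root
#   XRootD: root://fndca1.fnal.gov:1094/pnfs/fnal.gov/usr/icarus/scratch/some/dir/data_dl2_run4989_1_x.root
# pathToXRootD() maps the first form to the second, pathToPOSIX() the reverse;
# a path that does not match the expected pattern is returned unchanged.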


class MinimumAccumulator:
  def add(self, data, key = None):
    if key is None: key = data
    try:
      if key >= self.minKey: return False
    except AttributeError: pass # no self.minKey yet?
    self.minKey = key
    self.minData = data
    return True
  # add()
  def min(self): return self.minData
# class MinimumAccumulator
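# Minimal usage sketch (illustrative values):
#   lowest = MinimumAccumulator()
#   lowest.add("fileA", key=42)
#   lowest.add("fileB", key=7)
#   lowest.min()  # -> "fileB", the datum with the smallest key seen so far
# Note that min() raises AttributeError if add() was never called.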


def findFirstCycle(files, stream):
  firstLogger = None
  firstPassFiles = []
  wrapped = False
  for info in files:
    if info.stream != stream: continue
    if firstLogger == info.dataLogger: break # cycle completed
    if wrapped and info.dataLogger > firstLogger: break # cycle completed

    if firstLogger is None: firstLogger = info.dataLogger
    elif not wrapped and info.dataLogger < firstLogger: wrapped = True

    firstPassFiles.append(info)
    logging.debug("Added cycle %d logger %d stream %s to first cycle list",
                  info.pass_, info.dataLogger, info.stream)
  # for
  return firstPassFiles
# findFirstCycle()
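# Illustrative trace (made-up logger numbers): if the sorted files of a stream
# carry loggers [ 4, 5, 6, 0, 1, 2, 4, 5, ... ], the first cycle collected is
# [ 4, 5, 6, 0, 1, 2 ]: the wrap is detected at 0 (lower than the first
# logger, 4) and the second occurrence of 4 closes the cycle.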


def extractFirstEvent(filePath):
  try: import ROOT
  except ImportError:
    raise RuntimeError("""ROOT python module could not be loaded.
      In this condition, you'll have to skip the autodetection of the first logger
      by explicitly specifying its number as option to the script."""
      )
  # try ... except
  logging.debug("Opening '%s' for event number check...", filePath)
  srcFile = ROOT.TFile.Open(filePath, "READ")
  if not srcFile:
    raise RuntimeError \
      ("Failed to open '%s' for event number extraction." % filePath)
  #
  try: firstEvent = next(iter(srcFile.Events)) # go PyROOT
  except StopIteration:
    logging.debug("File '%s' appears to contain no events.", filePath)
    return None
  firstEventNumber = firstEvent.EventAuxiliary.event() # keep going PyROOT

  logging.debug("First event from '%s': %d", filePath, firstEventNumber)
  return firstEventNumber
# extractFirstEvent()


def detectFirstLogger(fileInfo):
  # in the end, we don't need a stream-aware algorithm to determine which
  # data logger received the first event, as long as we have all relevant
  # streams represented
  lowestEvent = MinimumAccumulator()
  for stream, files in fileInfo.items():
    if not len(files): continue
    for info in files:
      firstEvent = extractFirstEvent(info.pathToXRootD())
      if firstEvent is not None:
        lowestEvent.add(info, key=firstEvent)
        if firstEvent == 1: break # can't get lower than this!
    # for files
  # for
  try: firstLogger = lowestEvent.min().dataLogger
  except AttributeError:
    # this is in general a problem because it implies that we are failing to
    # correctly parse the list of input files
    raise RuntimeError("No data found for the first data logger pass.")
  logging.debug("Detected first logger: %d", firstLogger)
  return firstLogger
# detectFirstLogger()


def buildFileIndex(
    fileInfo: "list with information from all files",
    ) -> "a dictionary: { key -> list of files }":

  fileKey = lambda info: ( info.run, info.pass_, info.dataLogger, info.stream, )
  index = {}
  for info in fileInfo:
    index.setdefault(fileKey(info), []).append(info)
  return index
# buildFileIndex()
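# Illustrative shape of the returned index (made-up values); entries sharing
# run, pass, data logger and stream are the duplicates the main program hunts:
#   { (4989, 1, 2, None): [ infoA ],
#     (4989, 1, 3, None): [ infoB, infoC ] }   # infoC duplicates infoB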


if __name__ == "__main__":

  logging.basicConfig(level=logging.INFO)

  import argparse

  parser = argparse.ArgumentParser(description=__doc__)
  parser.set_defaults(skipDuplicates=True)

  parser.add_argument('inputFiles', nargs="*", metavar='inputFileNames',
    help='input file lists [one from stdin by default]')
  parser.add_argument('--firstlogger', type=int,
    help='index of the first data logger in the cycle')
  parser.add_argument('--output', '-o', default=None,
    help=
      'name of the file to write the resulting list into (overwritten!) [stdout]'
    )
  parser.add_argument('--nooutput', action="store_true",
    help='do not print the input files on screen nor write them to a file')

  duplGroup = parser.add_argument_group(title="duplicate file options")
  duplGroup.add_argument('--printduplicates', '-d', action="store_true",
    help='print duplicate files on screen')
  duplGroup.add_argument('--skipduplicates', '-S', dest='skipDuplicates',
    action="store_true",
    help='do not include duplicate files in the list (default)'
    )
  duplGroup.add_argument('--keepduplicates', '-K', dest='skipDuplicates',
    action="store_false",
    help='also include duplicate files in the list'
    )
  duplGroup.add_argument('--duplicatelist', '-D', type=str, default=None,
    help='name of a file list to be created with duplicate entries')

  parser.add_argument('--xrootd', '--root', '-X', action="store_true",
    help='convert the paths to XRootD URL')
  parser.add_argument('--posix', '-P', action="store_true",
    help='convert the paths to local POSIX path')
  parser.add_argument('--debug', action="store_true",
    help='print out debugging messages')
  parser.add_argument \
    ('--version', '-V', action='version', version='%(prog)s ' + __version__)

  args = parser.parse_args()

  if args.debug: logging.getLogger().setLevel(logging.DEBUG)

  if args.xrootd and args.posix:
    raise RuntimeError("XRootD and POSIX output format options are exclusive.")

  printDuplicates = args.printduplicates
  skipDuplicates = args.skipDuplicates
  makeDuplicateList = args.duplicatelist

  # "sources" are the input names ("<stdin>" when reading standard input)
  sources = args.inputFiles if args.inputFiles else [ "<stdin>" ]

  # "inputFiles" are all the files found in the sources
  inputFiles = (
    [ file_ ] if file_.endswith('.root') else open(file_, 'r')
    for file_ in args.inputFiles
    ) if args.inputFiles else [ sys.stdin, ]

  # example: /pnfs/icarus/persistent/users/ascarpel/trigger/4989/decoded/17247391_0/data_dl2_run4989_1_20210219T015125_20210219T200434-decode.root

  preComments = []
  postComments = []
  fileInfo = []
  sourceNames = []
  for iSource, file_ in enumerate(inputFiles):
    isSingleFile = isinstance(file_, list) and len(file_) <= 1
    for iLine, line in enumerate(file_):
      info = FileInfoClass(line, source=( iSource, None if isSingleFile else iLine + 1 ))
      if not info.is_file:
        if not info.path or info.path.startswith('#'):
          (postComments if fileInfo else preComments).append(info.line)
          continue
        else:
          logging.warning \
            ("Line %d ('%s') does not match file pattern." % (iLine + 1, info.path))
          continue
      # if not file
      fileInfo.append(info)
    # for line in file
  # for input files

  Streams = list(set( info.stream for info in fileInfo ))
  logging.debug("%d data files in %d streams: %s",
                len(fileInfo), len(Streams),
                ", ".join(stream if stream else "<none>" for stream in Streams)
                )

  if fileInfo and (args.firstlogger is None):
    # uses internal FileInfoClass ordering (firstLogger not set: any will do)
    fileInfo.sort()
    firstPassFiles = dict( ( stream, findFirstCycle(fileInfo, stream) )
                           for stream in Streams )
    assert firstPassFiles
    firstLogger = detectFirstLogger(firstPassFiles)
  else: firstLogger = args.firstlogger if args.firstlogger is not None else 4

  FileInfoClass.setFirstDataLogger(firstLogger)

  fileInfo.sort() # uses internal FileInfoClass ordering

  #
  # deal with duplicates
  #
  if printDuplicates or makeDuplicateList or skipDuplicates:
    nDuplicates = 0
    fileIndex = buildFileIndex(fileInfo)
    uniqueFiles = [] if skipDuplicates else None
    duplicateFiles = [] if makeDuplicateList else None
    # we rely on insertion-ordered dictionary guarantee of Python 3.7
    for fileList in fileIndex.values():
      mainInfo = fileList[0]
      if uniqueFiles is not None: uniqueFiles.append(mainInfo)
      if len(fileList) > 1:
        nDuplicates += len(fileList) - 1
        if duplicateFiles is not None: duplicateFiles.extend(fileList[1:])
        if printDuplicates:
          firstSource = mainInfo.source[0]
          msg = f"{mainInfo} with {len(fileList) - 1} duplicates of"

          if len(sources) > 1: msg += f" {sources[mainInfo.source[0]]}"
          if mainInfo.source[1] is not None: msg += f" line {mainInfo.source[1]}"
          msg += ":"
          for info in fileList[1:]:
            if info.source[0] != firstSource: msg += f" {sources[info.source[0]]}"
            if info.source[1] is not None: msg += f" line {info.source[1]}"
            msg += ";"
          # for
          logging.info(msg)
        # if print duplicates
      # if duplicates
    # for
    if nDuplicates: logging.info(f"Found {nDuplicates} duplicate files.")
    if duplicateFiles:
      with open(makeDuplicateList, 'w') as DuplicateListFile:
        for info in duplicateFiles: # lines still have their <CR>
          print(info.line, file=DuplicateListFile, end='')
      logging.info(f"{nDuplicates} duplicate file names written in '{makeDuplicateList}'.")
    # if we have duplicates and we write them
  # if print or store duplicates

  fileListContent = uniqueFiles if skipDuplicates else fileInfo


  #
  # print everything
  #

  # NOTE: keep this after all the input has been read,
  #       so that input files can be safely overwritten
  if not args.nooutput:
    outputFile = open(args.output, 'w') if args.output else sys.stdout

    # <CR> were not removed from `line`
    for line in preComments: outputFile.write(line)
    for info in fileListContent:
      if args.posix: line = info.pathToPOSIX() + '\n'
      elif args.xrootd: line = info.pathToXRootD() + '\n'
      else: line = info.line
      outputFile.write(line)
    for line in postComments: outputFile.write(line)

    if outputFile is not sys.stdout:
      logging.info \
        (f"{len(fileListContent)} file entries written into '{outputFile.name}'.")
      del outputFile
    # if
  else:
    logging.info(f"Found {len(fileListContent)} file entries.")
  # if ... else

  sys.exit(0)

# main