__doc__ = """Sorts a list of data logger output files.

File paths are read from all the specified file lists in sequence, or from
standard input if no file list is specified.

If a line is encountered that does not match the typical file name pattern,
that line is ignored and a warning is printed.

Comments and empty lines at the beginning of the first file list are printed
at the top of the output as they are. All other comments and empty lines are
printed at the end of the output.

Note that it is possible to sort "in place" by specifying the same file list as
input and output.

Duplicate files are files on the same run, data logger cycle and data logger
number. By default, only the first of the duplicate files is written into the
output list, and only the number of duplicates is printed. Options allow to
write a detailed list of duplicate files on screen and on disk, or not to check
for duplication altogether.
"""
__author__ = 'Gianluca Petrillo (petrillo@slac.stanford.edu)'
__date__ = 'February 22, 2022'
54 """Provides less() to compare numbers with a single offset cycle.
56 For example, with offset 3 the order of [0:20] would be, from the lowest:
57 [ 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 0, 1, 2, ]
def less(self, a, b):
    """Return whether `a` comes before `b` in the offset-cycled order.

    Numbers lower than `self.first` are wrapped past the end of the range,
    so with first=3 the order of [0:20] is [3, 4, ..., 19, 0, 1, 2].
    """
    # `a` and `b` compare naturally when both or neither is in the wrapped
    # segment (below `first`); otherwise the wrapped operand sorts last,
    # which is exactly when the natural comparison must be inverted.
    return (a < b) == ((a < self.first) == (b < self.first))
65 """Static object (namespace?) containing file name parsing utilities.
67 All supported file name patterns are in the `Patterns` class variable.
68 The static method `match()` tries all of them, in order.
75 'Pattern': re.compile(
r"data_dl(\d+)(_fstrm([^_]*))?_run(\d+)_(\d+)_(.*)\.root"),
77 'DataLogger': ( 1, int ),
78 'StreamName': ( 3, str ),
79 'RunNumber' : ( 4, int ),
80 'PassCount' : ( 5, int ),
84 'Name':
'multistream',
87 'Pattern': re.compile(
r"([^_]+)_data_dl(\d+)_run(\d+)_(\d+)_(.*)\.root"),
89 'DataLogger': ( 2, int ),
90 'StreamName': ( 1, str ),
91 'RunNumber' : ( 3, int ),
92 'PassCount' : ( 4, int ),
def get(self, *fieldNames):
    """Return a tuple with the value of each requested field, in order.

    Fields not present in `self.fields` yield `None` in their position.
    """
    return tuple(self.fields.get(fieldName, None) for fieldName in fieldNames)
112 for patternInfo
in FileNameParser.Patterns:
113 match = patternInfo[
'Pattern'].
match(name)
114 if match
is None:
continue
116 ( name, ( type_(value)
if (value := match.group(index))
else None ) )
117 for name, ( index, type_ )
in patternInfo[
'Parameters'].items()
119 d[
'Name'] = patternInfo[
'Name']
127 """This class collects information about a input file, including a sorting
131 XRootDprotocolHead =
'root://fndca1.fnal.gov:1094/'
132 XRootDprotocolDir =
'pnfs/fnal.gov/usr'
133 POSIXprotocolHead =
'/'
134 POSIXprotocolDir =
'pnfs'
136 POSIXPattern = re.compile(
137 POSIXprotocolHead.replace(
'.',
r'\.')
138 + POSIXprotocolDir.replace(
'.',
r'\.')
140 XRootDPattern = re.compile(
141 XRootDprotocolHead.replace(
'.',
r'\.')
142 + XRootDprotocolDir.replace(
'.',
r'\.')
154 line:
"input file line (should include endline)",
155 source:
"an arbitrary identifier to track the origin of the line" =
None,
157 """Constructor: use and parse the specified input file line."""
161 self.protocolAndDir, self.
name = os.path.split(self.
path)
162 parsedName = FileNameParser.match(self.
name)
165 self.dataLogger, self.run, self.stream, self.pass_ \
166 = parsedName.get(
'DataLogger',
'RunNumber',
'StreamName',
'PassCount')
170 """Comparison: run, then pass, then (offset cycled) data logger number."""
173 (
"Sorting not supported for non-file objects ('%s')" % self.
path)
175 if self.run < other.run:
return True
176 if self.run > other.run:
return False
178 if self.pass_ < other.pass_:
return True
179 if self.pass_ > other.pass_:
return False
181 if self.dataLogger != other.dataLogger:
183 FileInfoClass._DataLoggerSorter.less(self.dataLogger, other.dataLogger)
185 assert (self.stream
is None) == (other.stream
is None)
186 return False if self.stream
is None else self.stream < other.stream
191 s = f
"Run {self.run} cycle {self.pass_} data logger {self.dataLogger}"
192 if self.stream: s += f
" stream {self.stream}"
198 "XRootD conversion not supported for non-file objects ('%s')" % self.
path
201 match = FileInfoClass.POSIXPattern.match(self.
path)
203 FileInfoClass.XRootDprotocolHead, FileInfoClass.XRootDprotocolDir,
205 )
if match
else self.
path
209 def pathToPOSIX(self) -> "stored file path in POSIX (PNFS local) format
":
212 "XRootD conversion not supported for non-file objects ('%s')" % self.
path
215 match = FileInfoClass.XRootDPattern.match(self.
path)
217 FileInfoClass.POSIXprotocolHead, FileInfoClass.POSIXprotocolDir,
219 )
if match
else self.
path
def add(self, data, key=None):
    """Offer `data` to the accumulator; keep it only if its key is the lowest seen.

    If `key` is omitted, `data` itself is used as the comparison key.
    Returns True if `data` became the current minimum, False otherwise.
    """
    if key is None: key = data
    try:
        # Reject the candidate if we already hold a lower (or equal) key.
        if key >= self.minKey: return False
    except AttributeError:
        pass  # no minimum recorded yet: accept this first entry
    # NOTE(review): the lines storing the new minimum were missing from the
    # garbled source; reconstructed from min() returning self.minData — confirm.
    self.minKey = key
    self.minData = data
    return True
def min(self):
    """Return the data associated with the lowest key seen so far.

    Raises AttributeError if add() never accepted any entry.
    """
    return self.minData
245 if info.stream != stream:
continue
246 if firstLogger == info.dataLogger:
break
247 if wrapped
and info.dataLogger > firstLogger:
break
249 if firstLogger
is None: firstLogger = info.dataLogger
250 elif not wrapped
and info.dataLogger < firstLogger: wrapped =
True
252 firstPassFiles.append(info)
253 logging.debug(
"Added cycle %d logger %d stream %s to first cycle list",
254 info.pass_, info.dataLogger, info.stream)
256 return firstPassFiles
263 raise RuntimeError(
"""ROOT python module could not be loaded.
264 In this condition, you'll have to skip the autodetection of the first logger
265 by explicitly specifying its number as option to the script."""
268 logging.debug(
"Opening '%s' for event number check...", filePath)
269 srcFile = ROOT.TFile.Open(filePath,
"READ")
272 (
"Failed to open '%s' for event number extraction." % filePath)
274 try: firstEvent = next(iter(srcFile.Events))
275 except StopIteration:
276 logging.debug(
"File '%s' appears to contain no events.", filePath)
278 firstEventNumber = firstEvent.EventAuxiliary.event()
280 logging.debug(
"First event from '%s': %d", filePath, firstEventNumber)
281 return firstEventNumber
290 for stream, files
in fileInfo.items():
291 if not len(files):
continue
294 if firstEvent
is not None:
295 lowestEvent.add(info, key=firstEvent)
296 if firstEvent == 1:
break
299 try: firstLogger = lowestEvent.min().dataLogger
300 except AttributeError:
303 raise RuntimeError(
"No data found for the first data logger pass.")
304 logging.debug(
"Detected first logger: %d", firstLogger)
310 fileInfo:
"list with information from all files",
311 ) ->
"a dictionary: { key -> list of files }":
313 fileKey =
lambda info: ( info.run, info.pass_, info.dataLogger, info.stream, )
315 for info
in fileInfo:
316 index.setdefault(fileKey(info), []).append(info)
321 if __name__ ==
"__main__":
323 logging.basicConfig(level=logging.INFO)
327 parser = argparse.ArgumentParser(description=__doc__)
328 parser.set_defaults(skipDuplicates=
True)
330 parser.add_argument(
'inputFiles', nargs=
"*", metavar=
'inputFileNames',
331 help=
'input file lists [one from stdin by default]')
332 parser.add_argument(
'--firstlogger', type=int,
333 help=
'index of the first data logger in the cycle')
334 parser.add_argument(
'--output',
'-o', default=
None,
336 'name of the file to write the resulting list into (overwritten!) [stdout]'
338 parser.add_argument(
'--nooutput', action=
"store_true",
339 help=
'do not print on screen nor write to file the files in input')
341 duplGroup = parser.add_argument_group(title=
"duplicate file options")
342 duplGroup.add_argument(
'--printduplicates',
'-d', action=
"store_true",
343 help=
'print duplicate files on screen')
344 duplGroup.add_argument(
'--skipduplicates',
'-S', dest=
'skipDuplicates',
346 help=
'do not include duplicate files in the list (default)'
348 duplGroup.add_argument(
'--keepduplicates',
'-K', dest=
'skipDuplicates',
349 action=
"store_false",
350 help=
'include also duplicate files in the list (default)'
352 duplGroup.add_argument(
'--duplicatelist',
'-D', type=str, default=
None,
353 help=
'name of a file list to be created with duplicate entries')
355 parser.add_argument(
'--xrootd',
'--root',
'-X', action=
"store_true",
356 help=
'convert the paths to XRootD URL')
357 parser.add_argument(
'--posix',
'-P', action=
"store_true",
358 help=
'convert the paths to local POSIX path')
359 parser.add_argument(
'--debug', action=
"store_true",
360 help=
'prints out debugging messages')
361 parser.add_argument \
362 (
'--version',
'-V', action=
'version', version=
'%(prog)s ' + __version__)
364 args = parser.parse_args()
366 if args.debug: logging.getLogger().setLevel(logging.DEBUG)
368 if args.xrootd
and args.posix:
369 raise RuntimeError(
"XRootD and POSIX output format options are exclusive.")
371 printDuplicates = args.printduplicates
372 skipDuplicates = args.skipDuplicates
373 makeDuplicateList = args.duplicatelist
376 sources = args.inputFiles
if args.inputFiles
else [
"<stdin>" ]
380 [ file_ ]
if file_.endswith(
'.root')
else open(file_,
'r')
381 for file_
in args.inputFiles
382 )
if args.inputFiles
else [ sys.stdin, ]
390 for iSource, file_
in enumerate(inputFiles):
391 isSingleFile = isinstance(file_, list)
and len(file_) <= 1
393 info =
FileInfoClass(line, source=( iSource,
None if isSingleFile
else iLine + 1 ))
395 if not info.path
or info.path.startswith(
'#'):
396 (postComments
if fileInfo
else preComments).append(info.line)
400 (
"Line %d ('%s') does not match file pattern." % (iLine, info.path))
403 fileInfo.append(info)
407 Streams =
list(set( info.stream
for info
in fileInfo ))
408 logging.debug(
"%d data files in %d streams: %s",
409 len(fileInfo), len(Streams),
410 ", ".
join(stream
if stream
else "<none>" for stream
in Streams)
413 if fileInfo
and (args.firstlogger
is None):
417 for stream
in Streams )
418 assert firstPassFiles
420 else: firstLogger = args.firstlogger
if args.firstlogger
is not None else 4
422 FileInfoClass.setFirstDataLogger(firstLogger)
429 if printDuplicates
or makeDuplicateList
or skipDuplicates:
432 uniqueFiles = []
if skipDuplicates
else None
433 duplicateFiles = []
if makeDuplicateList
else None
435 for fileList
in fileIndex.values():
436 mainInfo = fileList[0]
437 if uniqueFiles
is not None: uniqueFiles.append(mainInfo)
438 if len(fileList) > 1:
439 nDuplicates += len(fileList) - 1
440 if duplicateFiles
is not None: duplicateFiles.extend(fileList[1:])
442 firstSource = mainInfo.source[0]
443 msg += f
"{mainInfo} with {len(fileList) - 1} duplicates of"
445 if len(sources) > 1: msg += f
" {sources[mainInfo.source[0]]}"
446 if mainInfo.source[1]
is not None: msg += f
" line {mainInfo.source[1]}"
448 for info
in fileList[1:]:
449 if info.source[0] != firstSource: msg += f
"{sources[info.source[0]]}"
450 if info.source[1]
is not None: msg += f
" line {info.source[1]}"
457 if nDuplicates: logging.info(f
"Found {nDuplicates} duplicate files.")
459 with
open(makeDuplicateList,
'w')
as DuplicateListFile:
460 for info
in duplicateFiles:
461 print(info.line, file=DuplicateListFile, end=
'')
462 logging.info(f
"{nDuplicates} duplicate file names written in '{makeDuplicateList}'.")
466 fileListContent = uniqueFiles
if skipDuplicates
else fileInfo
475 if not args.nooutput:
476 outputFile =
open(args.output,
'w')
if args.output
else sys.stdout
479 for line
in preComments: outputFile.write(line)
480 for info
in fileListContent:
481 if args.posix: line = info.pathToPOSIX() +
'\n'
482 elif args.xrootd: line = info.pathToXRootD() +
'\n'
483 else: line = info.line
484 outputFile.write(line)
485 for line
in postComments: outputFile.write(line)
487 if outputFile
is not sys.stdout:
489 (f
"{len(fileListContent)} file entries written into '{outputFile.name}'.")
493 logging.info(f
"Found {len(fileListContent)} file entries.")
do one_file $F done echo for F in find $TOP name CMakeLists txt print
auto enumerate(Iterables &&...iterables)
Range-for loop helper tracking the number of iteration.
S join(S const &sep, Coll const &s)
Returns a concatenation of strings in s separated by sep.
open(RACETRACK) or die("Could not open file $RACETRACK for writing")