5 from tqdm.auto
import tqdm
6 from multiprocessing
import Pool
20 return self.
f is not None
24 return dill.dumps({
"f": self.
f,
"name": self.
name})
27 data = dill.loads(state)
29 self.
name = data[
"name"]
34 if not isinstance(dfs, tuple):
35 assert(isinstance(dfs, pd.DataFrame))
40 npad = [max([len(b.split(
"."))
for b
in df.columns])
for df
in dfs]
42 while len(b) < npad[i]:
46 for i
in range(len(dfs)):
47 dfs[i].columns = pd.MultiIndex.from_tuples([pad(b.split(
"."), i)
for b
in dfs[i].columns])
50 for i
in range(len(dfs)):
51 if len(dfs[i].index.names) == 1
and dfs[i].index.names[0]
is None:
52 dfs[i].index = dfs[i].index.set_names([
"entry"])
57 fname, branches, index, applyf = inp
58 with uproot.open(fname)
as f:
59 dfW =
_makedf(f[names.folderW][names.tname].arrays(branches, library=
"pd"))
60 dfE =
_makedf(f[names.folderE][names.tname].arrays(branches, library=
"pd"))
68 dfW[
"__ntuple"] = index
69 dfW.set_index(
"__ntuple", append=
True, inplace=
True)
70 dfW = dfW.reorder_levels([dfW.index.nlevels-1] +
list(range(0, dfW.index.nlevels-1)))
72 dfE[
"__ntuple"] = index + 1
73 dfE.set_index(
"__ntuple", append=
True, inplace=
True)
74 dfE = dfE.reorder_levels([dfE.index.nlevels-1] +
list(range(0, dfE.index.nlevels-1)))
81 with uproot.open(fname)
as f:
87 dfW =
_makedf(rootf[names.folderW][names.tname].arrays(branches, library=
"pd"))
88 dfE =
_makedf(rootf[names.folderE][names.tname].arrays(branches, library=
"pd"))
90 valW = [v(dfW)
for v
in vars]
91 valE = [v(dfE)
for v
in vars]
93 wW = [
w(dfW)
if w
else None for w
in whens]
94 wE = [
w(dfE)
if w
else None for w
in whens]
96 runs = dfW.meta.run.unique()
101 for val, var
in zip(valW, vars):
102 hists[
"W"][r][var.name] = {}
103 for w, when
in zip(wW, whens):
105 hist = np.histogram(val[dfW.meta.run == r], bins=bins)
107 hist = np.histogram(val[w & (dfW.meta.run == r)], bins=bins)
109 hists[
"W"][r][var.name][when.name] = hist
114 for val, var
in zip(valE, vars):
115 hists[
"E"][r][var.name] = {}
116 for w, when
in zip(wE, whens):
118 hist = np.histogram(val[dfE.meta.run == r], bins=bins)
120 hist = np.histogram(val[w & (dfE.meta.run == r)], bins=bins)
122 hists[
"E"][r][var.name][when.name] = hist
128 if isinstance(g, list):
131 self.
glob = glob.glob(g)
134 def dataframe(self, branches=None, maxfile=None, nproc=1, f=None):
136 nproc = multiprocessing.cpu_count()
142 thisglob = thisglob[:maxfile]
145 with Pool(processes=nproc)
as pool:
146 thisglob = [(g, branches, i*2, f)
for i,g
in enumerate(thisglob)]
147 for df
in tqdm(pool.imap_unordered(_loaddf, thisglob), total=len(thisglob), unit=
"file", delay=5):
150 ret = pd.concat(ret, axis=0, ignore_index=
False)
153 sub_index = ret.index.names[2:]
154 ret = ret.reset_index()
155 ret.entry = ret.groupby([
"__ntuple",
"entry"]).ngroup()
156 ret.set_index([
"entry"] + sub_index, inplace=
True, verify_integrity=
True)
157 ret.sort_index(inplace=
True)
162 def histogram(self, var, bins, when=NTupleProc(), flatten_runs=
False, flatten_cryo=
False, maxfile=
None, nproc=1):
164 nproc = multiprocessing.cpu_count()
166 if not isinstance(var, list):
169 if not isinstance(when, list):
176 thisglob = thisglob[:maxfile]
178 globdata = [(f, self.
branches, var, when, bins)
for f
in thisglob]
180 with Pool(processes=nproc)
as pool:
181 for hists
in tqdm(pool.imap_unordered(_process, globdata), total=len(globdata), unit=
"file", delay=5):
182 for cname
in hists.keys():
183 for runname
in hists[cname].keys():
184 for varname
in hists[cname][runname].keys():
185 for whenname
in hists[cname][runname][varname].keys():
188 ret[cname][runname][varname][whenname] = hists[cname][runname][varname][whenname]
190 ret[cname][runname][varname][whenname] = self.
_hadd(hist, hists[cname][runname][varname][whenname])
195 for runname
in ret[
"E"].keys():
196 flatret_cryo[runname] = {}
197 for valname
in ret[
"E"][runname].keys():
198 flatret_cryo[runname][valname] = {}
199 for whenname
in ret[
"E"][runname][valname].keys():
200 flatret_cryo[runname][valname][whenname] = self.
_hadd(ret[
"E"][runname][valname][whenname], ret[
"W"][runname][valname][whenname])
205 flatret_run[
"E"] = {}
206 flatret_run[
"W"] = {}
209 histlist = [ret]
if flatten_cryo
else [ret[
"E"], ret[
"W"]]
210 makeflatlist = [flatret_run]
if flatten_cryo
else [flatret_run[
"E"], flatret_run[
"W"]]
212 for hists, makeflat
in zip(histlist, makeflatlist):
213 run0 =
list(hists.keys())[0]
215 for valname
in hists[run0].keys():
216 makeflat[valname] = {}
217 for whenname
in hists[run0][valname].keys():
218 makeflat[valname][whenname] = self.
_hadd(*[hists[runname][valname][whenname]
for runname
in hists.keys()])
223 if len(when) == 1
and not when[0]:
224 if flatten_runs
and flatten_cryo:
225 for varname
in ret.keys():
226 ret[varname] = ret[varname][
"None"]
228 for cname
in ret.keys():
229 for varname
in ret[cname].keys():
230 ret[cname][varname] = ret[cname][varname][
"None"]
232 for runname
in ret.keys():
233 for varname
in ret[runname].keys():
234 ret[runname][varname] = ret[runname][varname][
"None"]
236 for cname
in ret.keys():
237 for runname
in ret[cname].keys():
238 for varname
in ret[cname][runname].keys():
239 ret[cname][runname][varname] = ret[cname][runname][varname][
"None"]
244 Ns = [N
for N,_
in hs]
245 return np.sum(Ns, axis=0), hs[0][1]
248 for level
in [cryo, run, valname]:
249 if level
not in hist_dict:
250 hist_dict[level] = {}
251 hist_dict = hist_dict[level]
253 if whenname
not in hist_dict:
256 return hist_dict[whenname]
auto enumerate(Iterables &&...iterables)
Range-for loop helper tracking the number of iteration.
auto zip(Iterables &&...iterables)
Range-for loop helper iterating across many collections at the same time.