%pylab inline
import pandas as pd
import plotly
import seaborn as sns
sns.set_context("talk")
from matplotlib import cm
tab20 = cm.get_cmap('tab20')
Populating the interactive namespace from numpy and matplotlib
Dimensions | Matrixes |
---|---|
$N$: number of events (~100k) | |
$m$: number of input attributes (~10) | $X$: input matrix (N x m) |
$p$: number of target features (~1) | $Y$: matrix of targets (N x p) |
$q$: number of OPU input attributes (~1M) | $V$: matrix of OPU inputs (N x q) |
$r$: number of OPU random features (~10k) | $U$: matrix of random features (N x r) |
Track reconstruction at LHC as a collaborative data challenge use case with RAMP
Original paper : http://inspirehep.net/record/1616034
Original library : https://github.com/yetkinyilmaz/tracking
Python3 port : https://github.com/LAL/tracking2Dsim
Then ./runMany.sh
#!/bin/sh
for i in `seq 1439 2000`; do
python produce.py --seed $i --N 100 --M 10
done
part_cols=['event_id','cluster_id','pt', 'phi', 'xVtx', 'yVtx']
particles=pd.read_csv("data/particles_100_1439.csv", names=part_cols)
part_ev0=particles.query("event_id==143900")
part_ev0
event_id | cluster_id | pt | phi | xVtx | yVtx | |
---|---|---|---|---|---|---|
0 | 143900 | 0 | 3875.998795 | -3.113895 | 0.677386 | -0.103509 |
1 | 143900 | 1 | 7350.878984 | -1.019270 | -0.748584 | 0.954167 |
hits_cols=['event_id','cluster_id','layer','iphi', 'x', 'y']
hits=pd.read_csv("data/hits_100_1439.csv", names=hits_cols)
hits_ev0=hits.query("event_id==143900")
hits_ev0
event_id | cluster_id | layer | iphi | x | y | |
---|---|---|---|---|---|---|
0 | 143900 | 1 | 1 | 17898 | 44.550584 | -72.389540 |
1 | 143900 | 1 | 2 | 32651 | 81.526956 | -131.826990 |
2 | 143900 | 0 | 5 | 25801 | -404.613287 | -17.694297 |
3 | 143900 | 0 | 8 | 64177 | -997.739581 | -67.199164 |
4 | 143900 | 1 | 8 | 105688 | 541.313646 | -840.820752 |
5 | 143900 | 0 | 1 | 10792 | -84.955114 | -2.761992 |
6 | 143900 | 0 | 0 | 4951 | -38.979971 | -1.249757 |
7 | 143900 | 1 | 0 | 8207 | 20.331063 | -33.281344 |
8 | 143900 | 1 | 4 | 57118 | 143.210975 | -230.068722 |
9 | 143900 | 0 | 7 | 48759 | -760.727131 | -44.025352 |
10 | 143900 | 1 | 3 | 44881 | 112.294733 | -180.994179 |
11 | 143900 | 1 | 5 | 42704 | 215.023742 | -343.205172 |
12 | 143900 | 0 | 4 | 34474 | -270.797581 | -10.472350 |
13 | 143900 | 1 | 7 | 80461 | 409.395360 | -642.681445 |
14 | 143900 | 0 | 3 | 27078 | -212.857658 | -7.785727 |
See ./starting_kit/HEP_tracking_starting_kit.ipynb
The data provided to the user is a list of hit positions from a simple toy detector model that mimics the Atlas detector design (which is generic enough for recent silicon-based tracking detectors). The detector has an onion-like geometry with 9 layers surrounding the origin with polar distances of R = [39,85,155,213,271,405,562,762,1000] cm. These layers have a very small thickness compared to the distances, therefore the thickness can be neglected.
Each layer is segmented in azimuth with high granularity. There are ($2\pi$R/pitch)+1 pixels in every layer, where pitch is 0.025 cm for layers 0-4 and 0.05 cm for layers 5-9.
Every "pixel" corresponds to the smallest detector unit defined by layer and iphi (azimuthal index).
R=np.array([39,85,155,213,271,405,562,762,1000])
K=2*pi/0.025
gran=np.hstack([ K * np.ones(4) , K/2 * np.ones(5) ])
nseg=np.ceil(gran*R).astype(int)
nseg
array([ 9802, 21363, 38956, 53533, 34055, 50894, 70624, 95756, 125664])
nseg_cs=np.cumsum(nseg)
nseg_cs
array([ 9802, 31165, 70121, 123654, 157709, 208603, 279227, 374983, 500647])
nseg_loffset=np.roll(nseg_cs, 1)
nseg_loffset[0]=0
nseg_loffset
array([ 0, 9802, 31165, 70121, 123654, 157709, 208603, 279227, 374983])
dmdresolution=(912, 1140)
dmdtotpix=np.product(dmdresolution)
print("nbr DMD pixels : ", dmdtotpix)
print("2x nbr detector pixels :", 2*nseg_cs[-1])
print("Difference : ", dmdtotpix- 2*nseg_cs[-1] )
nbr DMD pixels : 1039680 2x nbr detector pixels : 1001294 Difference : 38386
dmdshape=(910, 1101)
dmdnpix=np.product(dmdshape)
fig, (ax1, ax2) = subplots(1, 2, figsize=(16, 8))
for ir, r in enumerate(R):
circle = Circle((0, 0), r, color=tab20(2*ir), fill=False, linewidth=10)
ax1.add_artist(circle)
axlim=1.1*R[-1]
ax1.set_xlim(-axlim, axlim)
ax1.set_ylim(-axlim, axlim)
ax1.axis("equal")
background=np.zeros(dmdnpix)
for i, (start, end) in enumerate(zip(2*nseg_loffset, 2*nseg_cs)):
background[start:end]=1+2*i
background=background.reshape(dmdshape)
ax2.matshow(background, cmap="tab20", vmin=0, vmax=19.5)
<matplotlib.image.AxesImage at 0x7fe2c3c37110>
for cluster, gp in hits_ev0.sort_values(['event_id','cluster_id', 'layer']).groupby("cluster_id"):
indices=2*(gp["iphi"]+ nseg_loffset[gp["layer"]]).values
bliparr=np.zeros(dmdnpix)
bliparr[indices]=True
fig, axes = plt.subplots(1, 2, sharey=True, figsize=(18, 8))
axes[0].matshow(background, cmap="tab20", vmin=0, vmax=19.5)
axes[1].matshow(bliparr.reshape(dmdshape), cmap="Blues")
fig.tight_layout()
indices
array([ 16414, 55400, 127632, 230004, 361544, 400826, 719376, 961342])
(Iacopo Poli, personal communication)
if (len(indices)%2==1): indices=np.append(indices, -1)
arr=ones(dmdnpix)
for row in indices.reshape(-1, 2):
arr[row[0]:row[1]]=False
fig, axes = plt.subplots(1, 2, sharey=True, figsize=(18, 8))
axes=axes.flatten()
axes[0].matshow(background-arr.reshape(dmdshape), cmap="tab20", vmin=0, vmax=19.5)
axes[1].matshow(arr.reshape(dmdshape), cmap="Blues")
fig.tight_layout()
from glob import glob
from tqdm import tqdm
from os.path import exists
pbar=tqdm(glob("data/hits*.csv"))
for file in tqdm(pbar):
break
events=file[14:18]
if exists("opu_switch2/swi"+events+".npz"):
continue
pbar.set_description("Evt " + events)
hits=pd.read_csv(file, names=['event_id','cluster_id','layer','iphi', 'x', 'y'])
clusters={}
for event, egp in hits.sort_values(['event_id','cluster_id', 'layer']).groupby("event_id"):
for cluster, gp in egp.sort_values(['event_id','cluster_id', 'layer']).groupby("cluster_id"):
arr=np.ones((910*1101))
indices=2*(gp["iphi"]+ nseg_loffset[gp["layer"]]).values
if (len(indices)%2==1): indices=np.append(indices, -1)
indices=indices.reshape(-1, 2)
for row in indices:
arr[row[0]:row[1]]=False
arr=arr.reshape(910,1101)
clusters[str(event)+"_"+str(cluster)]=arr
np.savez_compressed("opu_switch2/swi"+events, **clusters)
0%| | 0/563 [00:00<?, ?it/s] 0%| | 0/563 [00:00<?, ?it/s]
from os.path import exists
X_train=np.array([])
dfy=pd.DataFrame()
for prefile in range(1400, 1900):
sp=str(prefile)
fy="data/particles_100_"+sp+".csv"
fopu="switchoutput/trackml_opu_X_train_30k_2Dswitch_"+sp+".npy.npz"
if not ( exists (fy) & exists(fopu)): continue
Xt=np.load(fopu)['train']
dftmp=pd.read_csv(fy, names=['event','particle','pt', 'phi', 'xVtx', 'yVtx'])
if (len(Xt) != len(dftmp)) :
continue
X_train=Xt if len(X_train)==0 else np.concatenate([X_train, Xt])
dfy=pd.concat([dfy, dftmp])
print(np.shape(X_train), len(dfy))
(62555, 30000) 62555
from sklearn.linear_model import Ridge
from sklearn.model_selection import train_test_split
from time import time
def Xridge(X, y, gamma=1):
Xtr, Xte, ytr, yte = train_test_split(X, y, test_size=0.2)
ridge=Ridge()
start = time()
ridge.fit(Xtr, ytr)
end = time()
ypr=ridge.predict(Xte)
return ypr, yte.values, end-start
n_feats=5000
choice=np.random.choice(30000, n_feats)
X_choice=X_train[:, choice]
y_pred, y_true, timing=Xridge(X_choice, dfy[["pt", "phi"]])
figsize(12, 12)
scatter(y_true[:, 0], y_pred[:, 0], alpha=0.5, s=20)
xylim=10000
plot( (-xylim, xylim), (-xylim, xylim) , color="red")
xlabel("True value")
ylabel("Prediction")
title("$P_T$ estimate from OPU ridge regression, 5K features")
axis("equal")
print(timing, "s")
10.475732326507568 s
scatter(y_true[:, 1], y_pred[:, 1], alpha=0.5, s=20)
xylim=pi
plot( (-xylim, xylim), (-xylim, xylim) , color="red")
xlabel("True value")
ylabel("Prediction")
title("$\phi$ estimate from OPU ridge regression, 5K features")
axis("equal")
(-3.459043566740272, 3.4593753916708403, -5.231807773130645, 5.521512745768216)
dfy["rPt"]= 1/dfy["pt"]
y_pred, y_true, timing=Xridge(X_choice, dfy.rPt)
scatter(y_true, y_pred, alpha=0.5, s=20)
xylim=0.004
plot( (-xylim, xylim), (-xylim, xylim) , color="red")
xlabel("True value")
ylabel("Prediction")
title("1/Pt estimate from OPU ridge regression, 5K features")
axvline(1e-4, c="g")
axvline(-1e-4, c="g")
annotate("$1 / P_T^{max}} = 1/10 000$", xy=(1e-4, 3e-3), xytext=(5e-4, 3e-3), color="g", arrowprops=dict(arrowstyle="->", color='green'))
axis("equal")
xlim(-xylim, xylim)
ylim(-xylim, xylim)
(-0.004, 0.004)
dfp=pd.DataFrame.from_dict({"true": y_true, "pred": y_pred})
dfp["dif"]=dfp.pred-dfp.true
dfp.head()
true | pred | dif | |
---|---|---|---|
0 | 0.000509 | 0.000451 | -0.000058 |
1 | -0.000195 | 0.000168 | 0.000363 |
2 | -0.000101 | 0.000205 | 0.000306 |
3 | -0.000133 | 0.000323 | 0.000456 |
4 | 0.000320 | 0.000029 | -0.000291 |
scatter(dfp.true, dfp.dif, alpha=0.5, s=20)
xylim=0.004
xlabel("$y_{true}$")
ylabel("$y_{pred} - y_{true}$")
title("1/Pt estimate from OPU ridge regression, 5K features")
axvline(1e-4, c="g")
axvline(-1e-4, c="g")
annotate("$1 / P_T = 1/10 000$", xy=(1e-4, 3e-3), xytext=(5e-4, 3e-3), color="g", arrowprops=dict(arrowstyle="->", color='green'))
xlim(-xylim, xylim)
ylim(-xylim, xylim)
(-0.004, 0.004)
from scipy import signal
figsize(15, 6)
window=dfp["dif"].rolling(50, center=True)
xquart=dfp["dif"].sort_values().values
lquart=window.quantile(.25, interpolation='midpoint')
hquart=window.quantile(.75, interpolation='midpoint')
mquart=window.quantile(.50, interpolation='midpoint')
xquart=xquart[~numpy.isnan(mquart)]
lquart=lquart[~numpy.isnan(lquart)]
hquart=hquart[~numpy.isnan(hquart)]
mquart=mquart[~numpy.isnan(mquart)]
lsmooth=signal.savgol_filter(lquart, 5001, 3)
hsmooth=signal.savgol_filter(hquart, 5001, 3)
msmooth=signal.savgol_filter(mquart, 5001, 3)
plot(xquart, lquart, ".", alpha=0.2, color=tab20(1))
plot(xquart, lsmooth, color=tab20(0))
plot(xquart, mquart, ".", alpha=0.2, color=tab20(3))
plot(xquart, msmooth, color=tab20(2))
plot(xquart, hquart, ".", alpha=0.2, color=tab20(5))
plot(xquart, hsmooth, color=tab20(4))
xlabel("$y_{true}$")
ylabel("$y_{pred} - y_{true}$")
Text(0, 0.5, '$y_{pred} - y_{true}$')
figsize(12, 12)
scatter(dfp.true, dfp.dif, alpha=0.5, s=20, label="data")
xylim=0.004
xlabel("$y_{true}$")
ylabel("$y_{pred} - y_{true}$")
title("1/Pt estimate from OPU ridge regression, 5K features")
axvline(1e-4, c="g")
axvline(-1e-4, c="g")
annotate("$1 / P_T = 1/10 000$", xy=(1e-4, 3e-3), xytext=(5e-4, 3e-3), color="g", arrowprops=dict(arrowstyle="->", color='green'))
xlim(-xylim, xylim)
ylim(-xylim, xylim)
plot(xquart, msmooth, color="red", label="median")
fill_between(xquart, lsmooth, hsmooth, alpha=0.2, color="red", label="IQR")
legend()
<matplotlib.legend.Legend at 0x7fe2b9eb9dd0>
scatter(dfp.true, dfp.dif, alpha=0.5, s=20, label="data")
xlabel("$y_{true}$")
ylabel("$y_{pred} - y_{true}$")
title("1/Pt estimate from OPU ridge regression, 5K features")
axvline(1e-4, c="g")
axvline(-1e-4, c="g")
xlim(-0.0025, 0.0025)
ylim(-0.001, 0.001)
plot(xquart, msmooth, color="red", label="median")
fill_between(xquart, lsmooth, hsmooth, alpha=0.2, color="red", label="IQR")
legend()
<matplotlib.legend.Legend at 0x7fe2ba25f490>
figsize(15, 5)
ax=sns.distplot(dfp.dif, hist=False, rug=True)
ax.set_xlim(-1e-3, 1e-3)
(-0.001, 0.001)
from scipy.stats import iqr
fitres=[{"n_feats":n_feats,
"ytrue": y_true,
"ypred": y_pred,
"ydiff": y_pred - y_true,
"time" : timing,
"std": std(y_pred - y_true),
"iqr": iqr(y_pred - y_true)
}]
def appendRidge(n_feats):
dic={}
choice=np.random.choice(30000, n_feats)
X_choice=X_train[:, choice]
ypr, yte, timing=Xridge(X_choice, dfy.rPt)
dic["n_feats"]=n_feats
dic["ytrue"]=yte
dic["ypred"]=ypr
dic["ydiff"]=ypr-yte
dic["time"]=timing
dic["std"]=std(ypr-yte)
dic["iqr"]=iqr(ypr-yte)
fitres.append(dic)
xs=[x["n_feats"] for x in fitres]
ys=[x["iqr"] for x in fitres]
ts=[x["time"] for x in fitres]
text=["{:.3f} s".format(x["time"]) for x in fitres]
import plotly.graph_objects as go
marker_prop=dict(colorscale="Viridis", size=12, showscale=True, cmin=0, cmax=10, color=ts, colorbar={"title":"Time (s)"})
data=go.Scatter(x=xs, y=ys, hovertext=text, mode="markers", marker=marker_prop, hoverinfo="all")
fig = go.FigureWidget(data=[data])
fig.update_xaxes(range=[0.9, 4.5], type="log")
fig.update_yaxes(range=[0, 6.1e-4])
fig.update_layout(height=600, title_font_size=20, title_text="Ridge regression fit precision vs N random features ",
xaxis_title="Number of random features", yaxis_title="IQR", xaxis_title_font=dict(size=20), yaxis_title_font=dict(size=20))
trace= fig.data[0]
fig
for n_feats in np.logspace(1, log10(20000), 20, dtype=int):#[::-1]:
appendRidge(n_feats)
xs=[x["n_feats"] for x in fitres]
ys=[x["iqr"] for x in fitres]
ts=[x["time"] for x in fitres]
text=["{:.3f} s".format(x["time"]) for x in fitres]
trace.x = xs
trace.y = ys
trace.marker.color=ts
trace.hovertext=text
print(n_feats)
10 14 22 33 49 73 110 164 245 366 546 814 1215 1813 2706 4037