import matplotlib.pyplot as plt
import numpy as np
import scipy.integrate as integrate
import random
from scipy.optimize import curve_fit
from scipy.special import erfcinv
import scipy.stats as stats

# Problem 1
with open('LHC_data_2023.txt', 'r') as f:
    lines = (line.strip() for line in f if line)
    real_data = [float(line) for line in lines]

print(f"This file containes {len(real_data)} entries that range from {min(real_data)} Gev to {max(real_data)} Gev.")

def pdf_theory(x, constant):
    return 0*x + constant

# MC toy data for bump hunting
N_entries = len(real_data)
N_bins = int(N_entries/1000)

m_CSC = 30.0
sigma_CSC = 0.2
xsec_CSC = 0.01

bkg = np.random.uniform(20,40,int(N_entries*(1.0-xsec_CSC)))
th_signal = np.random.normal(m_CSC,sigma_CSC, int(N_entries*xsec_CSC))
total = np.concatenate((bkg, th_signal), axis=0)

counts_CSC,bin_edges_CSC = np.histogram(total,N_bins)
bin_centres_CSC = (bin_edges_CSC[:-1] + bin_edges_CSC[1:])/2.
err_CSC = np.sqrt(counts_CSC)

#Problem 2
def chi_2(func, const, data_x, data_y):
    chi2 = 0.
    for x,y in zip(data_x,data_y):
        chi2 += (func(x,const) - y)**2
    return chi2/len(data_x)

chi_2_histo = []

N_MC_experiments = int(1e4)
for i in range(N_MC_experiments):
    bkg = np.random.uniform(20,40,N_entries)
    counts,bin_edges = np.histogram(bkg,N_bins)
    bin_centres = (bin_edges[:-1] + bin_edges[1:])/2.

    chi_2_histo.append(chi_2(pdf_theory, N_entries/N_bins, bin_centres, counts))

weights = np.ones_like(chi_2_histo)/len(chi_2_histo)

N_bins_chi2 = 800
counts_test, bin_edges_test = np.histogram(chi_2_histo,N_bins_chi2)
bin_centres_test = (bin_edges_test[:-1] + bin_edges_test[1:])/2.

mu, sigma = stats.norm.fit(chi_2_histo)
x_pdf = np.linspace(0.,3*max(chi_2_histo),1000)
t_pdf = stats.norm.pdf(x_pdf, mu, sigma)
print(mu, sigma)

m = np.linspace(min(real_data), max(real_data), 1000)
pdf = pdf_theory(m, N_entries/N_bins)

figure, axis = plt.subplots(ncols=2,figsize=(18,8))
axis[0].errorbar(bin_centres_CSC, counts_CSC, yerr=err_CSC, fmt='o')
axis[0].set_ylim(0, 1.2*max(counts_CSC))
axis[0].set_title("Hypothetical signal with $m_{CSC}=$" + f"{m_CSC:.1f}GeV")
axis[0].set(xlabel='$m_{e^+e^-}$ [GeV]',ylabel='N events')
axis[0].plot(m,pdf)

t_obs_CSC = chi_2(pdf_theory, N_entries/N_bins, bin_centres_CSC, counts_CSC)
print("For $m_{CSC}=$"+f"{m_CSC} the observed test statistic would be t_obs = {t_obs_CSC:.2f}")
index = np.argmin(np.abs(np.array(bin_centres_test)-t_obs_CSC))
p_value_num = (sum(counts_test[index:N_bins_chi2])/sum(counts_test))
p_value_func = 1.0 - stats.norm.cdf(t_obs_CSC, mu, sigma)
print(stats.norm.cdf(t_obs_CSC, mu, sigma))
print(f"Calculated from histogram p value {p_value_num}")
print(f"Calculated from PDF p value {p_value_func:.20f}")

def significance(p_value):
    return np.sqrt(2)*erfcinv(2*p_value)

text = "Expected $t_{obs}$"+f"={t_obs_CSC:.2f}\n Expected p-value = {p_value_func:.8f}\n Expected significance = {significance(p_value_func):.2f}σ"
axis[1].hist(chi_2_histo, weights=weights, bins = N_bins_chi2)
axis[1].plot(x_pdf, t_pdf)
axis[1].axvline(x = t_obs_CSC, color = 'b', linestyle='--', label = '$t_{obs}$')
axis[1].set_title("Hypothesis testing")
axis[1].set(xlabel="t", ylabel="$g(t|H_0)$")
axis[1].text(max(chi_2_histo),0.004,text,fontsize=14)
axis[1].set_xlim([0, 3*max(chi_2_histo)])
plt.savefig("E4_Problem_1_2.pdf")
# plt.show()
plt.clf()

#Problem 3

counts_obs,bin_edges_obs = np.histogram(real_data,N_bins)
bin_centres_obs = (bin_edges_obs[:-1] + bin_edges_obs[1:])/2.
err_obs = np.sqrt(counts_obs)

m = np.linspace(min(real_data), max(real_data), 1000)
pdf = pdf_theory(m, N_entries/N_bins)

figure, axis = plt.subplots(ncols=2,figsize=(18,8))
axis[0].errorbar(bin_centres_obs, counts_obs, yerr=err_obs, fmt='o')
axis[0].set_ylim(0, 1.2*max(counts_obs))
axis[0].set_title("CMS data 2021-2023")
axis[0].set(xlabel='$m_{e^+e^-}$ [GeV]',ylabel='N events')
axis[0].plot(m,pdf)

t_obs = chi_2(pdf_theory, N_entries/N_bins, bin_centres_obs, counts_obs)
print(f"t_obs = {t_obs:.2f}")
p_value = p_value_func = 1.0 - stats.norm.cdf(t_obs, mu, sigma)
print(f"That corresponds to p value {p_value}")

text = "$t_{obs}$"+f"={t_obs:.2f}\n Observed p-value = {p_value:.8f}\n Observed significance = {significance(p_value):.2f}σ"
axis[1].hist(chi_2_histo, weights=weights, bins = N_bins_chi2)
axis[1].axvline(x = t_obs, color = 'b', linestyle='--', label = '$t_{obs}$')
axis[1].plot(x_pdf, t_pdf)
axis[1].set_title("Hypothesis testing")
axis[1].set(xlabel="t", ylabel="$g(t|H_0)$")
axis[1].text(max(chi_2_histo),0.004,text,fontsize=14)
axis[1].set_xlim([0, 3*max(chi_2_histo)])
plt.savefig("E4_Problem_3.pdf")
plt.show()
plt.clf()
