diff --git a/.gitignore b/.gitignore index 7bef8a5..6b5be3d 100644 --- a/.gitignore +++ b/.gitignore @@ -302,4 +302,4 @@ TSWLatexianTemp* # precommit autofixes?? -*.bak0 +*.bak* diff --git a/Images/DQM GUI.png b/Images/DQM GUI.png new file mode 100644 index 0000000..9fc10e1 Binary files /dev/null and b/Images/DQM GUI.png differ diff --git a/Images/DQM Workflow.png b/Images/DQM Workflow.png new file mode 100644 index 0000000..d4eeaa0 Binary files /dev/null and b/Images/DQM Workflow.png differ diff --git a/Images/Loss_function1.png b/Images/Loss_function1.png new file mode 100644 index 0000000..95bdc3f Binary files /dev/null and b/Images/Loss_function1.png differ diff --git a/Images/Loss_function2.png b/Images/Loss_function2.png new file mode 100644 index 0000000..1c924c4 Binary files /dev/null and b/Images/Loss_function2.png differ diff --git a/Images/MLP.png b/Images/MLP.png new file mode 100644 index 0000000..fa2aa17 Binary files /dev/null and b/Images/MLP.png differ diff --git a/Images/OMS.png b/Images/OMS.png new file mode 100644 index 0000000..146673e Binary files /dev/null and b/Images/OMS.png differ diff --git a/Images/Project_Reco.png b/Images/Project_Reco.png new file mode 100644 index 0000000..c684d6a Binary files /dev/null and b/Images/Project_Reco.png differ diff --git a/Images/RR.png b/Images/RR.png new file mode 100644 index 0000000..f2e3680 Binary files /dev/null and b/Images/RR.png differ diff --git a/Images/certhelp-cert.png b/Images/certhelp-cert.png new file mode 100644 index 0000000..a2ae4f1 Binary files /dev/null and b/Images/certhelp-cert.png differ diff --git a/Images/certhelper-list.png b/Images/certhelper-list.png new file mode 100644 index 0000000..d0bf33b Binary files /dev/null and b/Images/certhelper-list.png differ diff --git a/Images/certhelper-menu.png b/Images/certhelper-menu.png new file mode 100644 index 0000000..cc7ad0e Binary files /dev/null and b/Images/certhelper-menu.png differ diff --git a/Images/certhelper-portal.png b/Images/certhelper-portal.png new file mode 100644 index 0000000..5a0fbcd Binary files /dev/null and b/Images/certhelper-portal.png differ diff --git a/Images/pdfs/Ratio histograms.pdf b/Images/pdfs/Ratio histograms.pdf new file mode 100644 index 0000000..bbd5da0 Binary files /dev/null and b/Images/pdfs/Ratio histograms.pdf differ diff --git a/Images/pdfs/Testing Ratios.pdf b/Images/pdfs/Testing Ratios.pdf new file mode 100644 index 0000000..8c4e2cf Binary files /dev/null and b/Images/pdfs/Testing Ratios.pdf differ diff --git a/Images/pdfs/fits_16-APV-HIPM.pdf b/Images/pdfs/fits_16-APV-HIPM.pdf new file mode 100644 index 0000000..7149f8f Binary files /dev/null and b/Images/pdfs/fits_16-APV-HIPM.pdf differ diff --git a/Images/pdfs/fits_16.pdf b/Images/pdfs/fits_16.pdf new file mode 100644 index 0000000..a8bea1b Binary files /dev/null and b/Images/pdfs/fits_16.pdf differ diff --git a/Images/pdfs/fits_17.pdf b/Images/pdfs/fits_17.pdf new file mode 100644 index 0000000..7b4774a Binary files /dev/null and b/Images/pdfs/fits_17.pdf differ diff --git a/Images/pdfs/fits_18.pdf b/Images/pdfs/fits_18.pdf new file mode 100644 index 0000000..0a6f24d Binary files /dev/null and b/Images/pdfs/fits_18.pdf differ diff --git a/Images/pdfs/fits_closeup_16-APV-HIPM.pdf b/Images/pdfs/fits_closeup_16-APV-HIPM.pdf new file mode 100644 index 0000000..c5cb47e Binary files /dev/null and b/Images/pdfs/fits_closeup_16-APV-HIPM.pdf differ diff --git a/Images/pdfs/fits_closeup_16.pdf b/Images/pdfs/fits_closeup_16.pdf new file mode 100644 index 0000000..574e301 
Binary files /dev/null and b/Images/pdfs/fits_closeup_16.pdf differ
diff --git a/Images/pdfs/fits_closeup_17.pdf b/Images/pdfs/fits_closeup_17.pdf
new file mode 100644
index 0000000..73ab243
Binary files /dev/null and b/Images/pdfs/fits_closeup_17.pdf differ
diff --git a/Images/pdfs/fits_closeup_18.pdf b/Images/pdfs/fits_closeup_18.pdf
new file mode 100644
index 0000000..08538dd
Binary files /dev/null and b/Images/pdfs/fits_closeup_18.pdf differ
diff --git a/Images/pdfs/modelTB.pdf b/Images/pdfs/modelTB.pdf
new file mode 100644
index 0000000..5590087
Binary files /dev/null and b/Images/pdfs/modelTB.pdf differ
diff --git a/Images/ranking.png b/Images/ranking.png
new file mode 100644
index 0000000..c6ab1a6
Binary files /dev/null and b/Images/ranking.png differ
diff --git a/chapters/Chapter02.tex b/chapters/Chapter02.tex
index 1327d6e..b9833f1 100644
--- a/chapters/Chapter02.tex
+++ b/chapters/Chapter02.tex
@@ -41,9 +41,9 @@ \chapter{The CMS Detector\label{ch:CMS}}
 \autoref{CMSLayers} depicts the particle detection process in CMS. Charged particles leave signatures in the inner tracking system, and the vertices from decaying short-lived particles can be identified. Photons, electrons, neutral pions and kaons are stopped in the crystals of the electromagnetic calorimeter (ECAL) and the scintillation light is used to determine the deposited energy. Hadrons punch through further and are generally stopped by the hadronic calorimeter (HCAL), where jets are confined and only the highest-energy hadrons and muons pass through the superconducting solenoid into the outer regions of the CMS barrel. Finally, muons are detected in the various muon detectors which interleave the return yoke of the magnet. Neutrinos escape from the CMS detector and are inferred from an imbalance of energy in the reconstructed event called missing transverse energy (MET or $\vec{p}_T^{\text{miss}}$). More detailed descriptions of the CMS detector, together with a definition of the coordinate system used and the relevant kinematic variables, can be found in Refs.~\cite{CMS:2008xjf,CMS:2023gfb}.
-\begin{figure}
+\begin{figure}[h]
 \centering
- \includegraphics[width=\linewidth]{CMSLayers.png}
+ \includegraphics[width=.9\linewidth]{CMSLayers.png}
 \caption[Particle trajectories and footprint in CMS]{The trajectory of a particle traveling through the layers of the detector leaving behind it's signature footprint\label{CMSLayers}}
 \end{figure}
diff --git a/chapters/Chapter03.tex b/chapters/Chapter03.tex
index e21a068..b7cd7ec 100644
--- a/chapters/Chapter03.tex
+++ b/chapters/Chapter03.tex
@@ -3,77 +3,158 @@ \chapter{Emerging Jets (EJs) \label{ch:emj}}
 \section{Background information on EJs}
-The Emerging Jets concept arises from the paper by P. Schwaller \cite{Schwaller:2015gea} where it was proposed to search for the Emerging Jets signature in the Run 1 dataset of the LHC Experiments to set limits on a combination of parameter ranges. Many studies of Dark matter require new physics that is beyond the Standard Model of Particle Physics (BSM) and objects such as weakly interacting massive particles (WIMPs) have not been fruitfull in this regard.
+Many studies of dark matter require new physics that is beyond the Standard Model of Particle Physics (BSM), and searches with objects such as weakly interacting massive particles (WIMPs) have not been fruitful in this regard.
+A class of models that includes electrically neutral fermions called ``dark quarks'' ($Q_{DK}$), which are not charged under the SM forces but are charged under a new force in the dark sector (``dark QCD''), has properties similar to quantum chromodynamics (SM QCD).
+These models naturally explain the observed mass densities of matter and dark matter.
+
+The emerging jets concept arises from \cite{Schwaller:2015gea}, where it was proposed to search for a new signature in the Run 1 dataset of the LHC experiments and to set limits on a combination of parameter ranges. The EJs model is a dark matter model that assumes that there is a QCD-like hidden sector. In particular, in these high-energy collisions, a heavy dark mediator ($X_{DK}$) with a mass of $\order{\text{TeV}}$ is produced, decaying into dark hadrons and mesons that further decay into SM particles.
+Due to the hierarchy of GeV to TeV energy scales (see \cref{fig:dark-qcdmodel}), the decay process allows for dark matter particles to travel a measurable distance before decaying.
+In \cref{fig:emj_production1} we see the production processes of the EJs signature. There are two ways of producing EJs at the LHC: gluon-gluon fusion and quark anti-quark annihilation.
+Both of these produce a pair of heavy dark mediators, each of which then decays into an SM quark (\textit{q}) and a dark quark (\Qdark).
+Further on, we see from \cref{fig:full-chain} that the \Qdark will decay into \pidark.
+Since these dark pions are unstable and do not carry a dark baryon number, they then decay after some measurable distance into SM particles~\cite{Bai_2014} and form SM jets that we can detect.
 \begin{figure}
 \centering
- \includegraphics[width=.58\linewidth]{emj_detector.png}
- \caption[Illustration of the emerging jets forming in the CMS detector]{An illustration of the pair production of dark quarks forming two emerging jets. The dark mesons are represented by dashed lines as they do not interact with the detector. After traveling some distance, each individual dark pion decays into Standard Model particles, creating a small jet represented by solid colored lines. Because of the exponential decay, each set of SM particles originates at a different distance from the interaction point, so the jet slowly emerges into the detector. }
- \label{fig:2emj_inCMS}
+ \includegraphics[width=0.8\linewidth]{Images/DarkQCDModel.png}
+ \caption[The hierarchy of the GeV to TeV scales.]{The hierarchy of the GeV to TeV scales. In this model the dark scalar mediator $X_d$ couples to both dark and SM sectors. Reprinted from \cite{Schwaller:2015gea}}
+ \label{fig:dark-qcdmodel}
 \end{figure}
-The full Run 2 dataset is used in the latest search for emerging jets \cite{CMS:2024gxp} accumulating 138 \unit{\per\femto\barn} to search for this signature.
-
+\begin{figure}
+ \begin{center}
+ \begin{subfigure}{.45\linewidth}
+ \includegraphics*[width=\linewidth]{pdfs/BSSWPairProduction_ggFusion.pdf}
+ \caption{gluon-gluon fusion}
+ \end{subfigure}
+ \begin{subfigure}{.45\linewidth}
+ \includegraphics*[width=\linewidth]{pdfs/BSSWPairProduction_qqAnnihilation.pdf}
+ \caption{quark anti-quark annihilation}
+ \end{subfigure}
+ \end{center}
+ \caption[Emerging jets production modes]{Feynman diagrams for pair production of dark mediator particles, with each mediator decaying to an SM quark and a dark quark.
The bar ($-$) over the quark symbols signifies that they are antiparticles, as does the dagger ($\dagger$) over the \Mdark.}
+ \label{fig:emj_production1}
+\end{figure}
+\begin{figure}
+ \centering
+ \includegraphics[width=.8\linewidth]{Images/EMJ_production.png}
+ \caption{Example of the full chain of one production mode.}
+ \label{fig:full-chain}
+\end{figure}
 \begin{figure}
 \centering
 \begin{subfigure}{.45\linewidth}
 \includegraphics*[width=\textwidth]{pdfs/FlavoredSchematicOfEvent.pdf}
 \caption{Flavor aligned}
+ \label{fig:emj_prod2A}
 \end{subfigure}
 \begin{subfigure}{.45\linewidth}
 \includegraphics*[width=\textwidth]{pdfs/UnflavoredSchematicOfEvent.pdf}
 \caption{Unflavored}
 \end{subfigure}
- \caption{The Emerging Jets event models}
- \label{fig:emj_production}
+ \caption[Simplified schematics of the production modes for the Emerging Jets models.]{Simplified schematics of the production modes for the Emerging Jets models. On the left, we show the flavor-aligned model, where all \Qdark couple to down-type SM quarks only (d, s, b).
+ In this model the \pidark have variable lifetimes (\ctaudpi), which depend on their composition and on the Yukawa coupling constant ($\kappa$) between the mediator particle, the dark quarks, and the SM down quark. This parameter represents the lifetime of each track inside the emerging jets.
+ On the right, we show the simpler unflavored model. This produces \Qdark that couple to the down quark only, and all \pidark lifetimes are the same.}
+ \label{fig:emj_production2}
+\end{figure}
+
+
+% Add explanation on two EMJ models
+The experimental searches \cite{sirunyan2019search,CMS:2024gxp} have explored the TeV scale for the EJs signature.
+The latest search for emerging jets uses the full Run 2 dataset collected by the CMS Collaboration in 2016--2018 from pp collisions at a center-of-mass energy of 13 TeV, corresponding to 138\unit{\per\femto\barn} of data~\cite{CMS:2024gxp}.
+In the latest search, we studied the models described in \cite{Bai_2014,Schwaller:2015gea,Renner_2018}, expanded on the phase space searched in \cite{sirunyan2019search}, and included the second model, portrayed in \cref{fig:emj_prod2A}, which allows the \Qdark to couple to all down-type quarks. In this model, each dark subcomponent (or \pidark) within the dark jet can subsequently decay into SM particles at different distances along the jet axis.
+Here, the lifetimes of the dark mesons depend on the \Qdark composition and the Yukawa coupling constant ($\kappa$) between the dark quarks and the SM down quark.
+In the unflavored model, all \Qdark are degenerate, while in the flavored model, three dark quark flavors with non-degenerate couplings are considered.
+For the unflavored model, the average decay length of a dark pion is given by \cref{eq:unflavored-ctau}
+\begin{equation}
+ \ctaudpi = 80~\unit{mm} \pgroup{\frac{1}{\kappa^4}} \pgroup{\frac{2 ~\unit{GeV} }{f_{\pidark}}}^2 \pgroup{\frac{100 ~\unit{MeV}}{m_\text{d}} }^2 \pgroup{\frac{2~\unit{GeV}}{m_{\pidark}}} \pgroup{\frac{m_{\Mdark}}{1~\unit{TeV}}}^4
+ \label{eq:unflavored-ctau}
+\end{equation}
+where $f_{\pidark}$ is the dark pion decay constant, $m_\text{d}$ is the mass of the SM down quark, and $m_{\pidark}$ is the dark pion mass.
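+As a quick illustration of the scales set by \cref{eq:unflavored-ctau} (the parameter values used here are simply the reference values at which every ratio in the equation is unity, not analysis benchmarks): for $\kappa = 1$, $f_{\pidark} = 2~\unit{GeV}$, $m_\text{d} = 100~\unit{MeV}$, $m_{\pidark} = 2~\unit{GeV}$, and $m_{\Mdark} = 1~\unit{TeV}$ one obtains $\ctaudpi = 80~\unit{mm}$; raising the mediator mass to $2~\unit{TeV}$ with everything else fixed increases this by $2^4$ to roughly $1.3~\unit{m}$, while doubling $\kappa$ instead reduces it by a factor of 16 to $5~\unit{mm}$.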
+In the flavor-aligned model, the coupling constant becomes a matrix $\kappa_{\alpha i}$, where the subscript $\alpha ~(i)$ denotes the flavor of the dark (SM) quarks. In this case, the average decay length for dark mesons is given by \cref{eq:flavored-ctau}
+\begin{equation}
+ \small
+ \ctaudpi^{\alpha \beta} = \dfrac{8\pi m^4_{\Mdark}}{ N_c m_{\pidark} f^2_{\pidark} \displaystyle \sum_{i,j} \abs{\kappa_{\alpha i} \kappa_{\beta j}^*}^2 \pgroup{m_i^2 + m_j^2} \sqrt{ \pgroup{1- \dfrac{(m_i + m_j)^2 }{m^2_{\pidark}} } \pgroup{1- \dfrac{(m_i - m_j)^2 }{m^2_{\pidark}} } } }
+ \label{eq:flavored-ctau}
+\end{equation}
+where $m_{\Mdark}$ is the mediator mass, $N_c$ is the SM color factor and $m_i, m_j$ are the masses of the SM quarks with flavor indices $i, j$, respectively~\cite{CMS:2024gxp}.
+\Cref{fig:lifetimes} shows the different $c\tau$ for a given $m_{\pidark}$ based on the \pidark composition in the flavor-aligned model. In general, the lifetime of the dark pions decreases as their mass increases, as opposed to the unflavored model, where the lifetimes are the same for all \pidark.
+\begin{figure}[b]
+ \centering
+ \includegraphics[width=.65\linewidth]{Images/pdfs/FlavoredLifetime.pdf}
+ \caption[Lifetimes of the dark pions as a function of their mass.]{Lifetime of the \pidark as a function of the $m_{\pidark}$ in the flavor-aligned model. The jumps in the plot indicate new energy states becoming available.}
+ \label{fig:lifetimes}
+\end{figure}
+
+This latest search makes use of the increased amount of data and introduces machine learning techniques to search for emerging jets.
+We can see from \cref{fig:decay-distance} that this model produces decay lengths of $\order{\unit{mm}-\unit{m}}$, to which the CMS detector is hence sensitive.
+\Cref{fig:2emj_inCMS} illustrates how two EJs would look in a detector. It is assumed that from the high-energy collisions, dark quarks with enough energy hadronize and decay into dark mesons like pions ($\pi_{Dk}$). In the detector, the SM quarks hadronize to produce SM jets and the dark quarks also hadronize to make dark jets. The main signature for the analysis is events with high event energy, in particular in the form of a quantity known as $H_T$. The $H_T$ of an event is defined as the scalar sum of the transverse momenta ($p_T$) of the 4 leading jets (2 EJs, 2 SM jets):
+\begin{equation}
+ H_T = \sum_{i=1}^4 p_T^i
+\end{equation}
+
+\begin{figure}[tb]
+ \centering
+ \includegraphics[width=.75\linewidth]{Images/Decay-distances-emj.png}
+ \caption[2D distributions of decay distances and $\pi_{DK}$ momentum]{2D distribution of the decay distances and the $\pi_{DK}$ momentum.
Reprinted from \cite{Schwaller:2015gea}}
+ \label{fig:decay-distance}
 \end{figure}
+The free parameter ranges that are studied are:
+\begin{itemize}
+ \item $m_{\Mdark} \in [1,2.5]$ TeV
+ \item $m_{\pidark} \leq 20$ GeV
+ \item $\ctaudpi \leq 500$ mm
+\end{itemize}
+
+
 \begin{figure}
- \begin{center}
- \begin{subfigure}{.45\linewidth}
- \includegraphics*[width=\linewidth]{pdfs/BSSWPairProduction_ggFusion.pdf}
- \caption{gluon-gluon fusion}
- \end{subfigure}
- \begin{subfigure}{.45\linewidth}
- \includegraphics*[width=\linewidth]{pdfs/BSSWPairProduction_qqAnnihilation.pdf}
- \caption{quark anti-quark annihilation}
- \end{subfigure}
- \end{center}
- \caption[Emergin jets production modes]{Feynman diagrams for pair production of dark mediator particles, with mediators decay to an SM quark and a dark quark}
+ \centering
+ \includegraphics[width=.58\linewidth]{emj_detector.png}
+ \caption[Illustration of the emerging jets forming in a detector]{An illustration of the pair production of dark quarks forming two emerging jets. Dashed lines represent the dark mesons as they do not interact with the detector. After traveling some distance, each dark pion decays into Standard Model particles, creating a small jet represented by solid colored lines. Because of the exponential decay, each set of SM particles originates at a different distance from the interaction point, so the jet slowly emerges into the detector. Figure and description adapted from \cite{Schwaller:2015gea}}
+ \label{fig:2emj_inCMS}
 \end{figure}
-In figure \ref{fig:emj_production} we see the production process of the emerging jets signature.
 \clearpage
 \section{Trigger Efficiency and Scale Factor studies}
-With a beam spacing of 25~ns, beam crossings occur in the CMS detector at a rate of 40 million per second 40\unit{\MHz}.
-An additional complication is the approximately 25 interactions (referred to as `pileup', which is currently already much higher than the design value of 25) which occur with each beam crossing -- thus giving 1 billion events occurring in the CMS detector every second. In order to extract physics from these interactions it is vital to have fast electronics and very good resolution (proton-proton interactions are very messy and produce hundreds or thousands of particle candidates) and, because these events occur far too quickly to all be recorded and would take up vast amounts of disk space to store what are, for the majority, uninteresting events, very precise ``triggering'' is required.
+With a beam spacing of 25\unit{ns}, beam crossings occur in the CMS detector at a rate of 40 million per second (40\unit{\MHz}).
+An additional complication is the more than 25 simultaneous interactions (``pileup'') that occur with each beam crossing -- giving on the order of a billion interactions in the CMS detector every second. To extract physics from these interactions it is vital to have fast electronics and good resolution (proton-proton interactions are very messy and produce hundreds or thousands of particle candidates). Because these events occur far too quickly to all be recorded, and storing the mostly uninteresting events would take up vast amounts of disk space, very precise ``triggering'' is required.
-Events of interest are selected using a two-tiered trigger system. The first level (L1), composed of custom hardware processors, uses information from the calorimeters and muon detectors to select events at a rate of around 100~\unit{kHz} within a fixed latency of 4\unit{\us} ~\cite{CMS:2020cmk}.
The second level, known as the high-level trigger (HLT), consists of a farm of processors running a version of the full event reconstruction software optimized for fast processing, and reduces the event rate to around 1~\unit{kHz} before data storage~\cite{CMS:2016ngn}.
+Events of interest are selected using a two-tiered trigger system. The first level (L1), composed of custom hardware processors, uses information from the calorimeters and muon detectors to select events at a rate of around 100~\unit{kHz} within a fixed latency of 4~\unit{\us} \cite{CMS:2020cmk}. The second level, known as the high-level trigger (HLT), consists of a farm of processors running a version of the full event reconstruction software optimized for fast processing and reduces the event rate to around 1\unit{kHz} before data storage~\cite{CMS:2016ngn}.
-
-The $H_T$ triggers chosen are the triggers with the lowest online $H_T$ threshold that are not pre-scaled. The configurations used for this analysis are:
+There are multiple types of triggers, and each determines the kind of physics dataset under which the data will be classified. There are a few main datasets used to classify physics data; the ones relevant to this analysis are \emph{JetHT} for 2016--2018 and \emph{SinglePhoton}/\emph{EGamma} for 2016--2017/2018. These are considered to be orthogonal datasets, as we do not expect a large overlap of the physics processes that take place in them. Each category\footnote{also called a data stream} comprises an exhaustive list of ``trigger paths'' that are executed to decide which more specific conditions or subprocesses have taken place when recording the events.
+The $H_T$ triggers chosen are the triggers with the lowest online $H_T$ threshold that are not pre-scaled. The configurations used for this analysis for the \textit{JetHT} data stream are:
 \begin{itemize}
 \item \verb|HLT_PFHT900_v* OR HLT_PFJet450_v*| for 2016. The addition of a jet trigger path in an OR configuration is the recommended path to mitigate an observed inefficiency at high values of $H_T$ caused by the Level-1 trigger firmware issues for 2016.
 \item \verb|HLT_PFHT1050_v*| for 2017 and 2018.
 \end{itemize}
-The calculation of the trigger efficiency is carried out by using the orthogonal \verb|HLT_Mu50_v*| trigger as the reference trigger. To determine the offline HT threshold at which the trigger can be considered to be fully efficient was estimated by fitting the trigger efficiency as a function of $H_T$ to an error function (erf) and an algebraic function ($f$):
+The paths used for the \textit{SinglePhoton} data stream are:
+\begin{itemize}
+ \item \verb|Photon165_HE10_v*| for 2016/2016 HIPM
+ \item \verb|Photon200_v*| for 2017/2018
+\end{itemize}
+
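+As a minimal illustration of how such an OR of trigger paths can be applied offline, the Python sketch below selects events passing either of the 2016 \textit{JetHT} paths from NanoAOD-style trigger flags; the file name and branch names are assumptions chosen for illustration, not the actual analysis code.
+\begin{verbatim}
+import uproot
+
+# Illustrative only: file path and flag names are placeholders (NanoAOD-style).
+events = uproot.open("JetHT_2016.root")["Events"]
+flags = events.arrays(["HLT_PFHT900", "HLT_PFJet450"], library="np")
+
+# 2016 JetHT selection: logical OR of the HT path and the single-jet path,
+# which mitigates the inefficiency of the HT path alone at high HT.
+passes = flags["HLT_PFHT900"] | flags["HLT_PFJet450"]
+print("selected", passes.sum(), "of", len(passes), "events")
+\end{verbatim}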
+We also have Monte Carlo (MC) simulation equivalents of these main datasets, in which we have access to the ground-truth (generator-level) information of the simulated events.
+Though not directly used to draw the conclusions of the search, SM MC samples are used to develop the analysis strategy. The SM multi-jet MC samples are used as a stand-in for expected background events in the \textit{JetHT} data and are used for event/object selection optimization, closure tests, and the evaluation of uncertainties. The $\gamma$+jets MC samples are used as a stand-in for the \textit{SinglePhoton} data stream, are used for template histogram generation, and are compared with the multi-jet MC to evaluate uncertainties associated with \textit{JetHT}/\textit{SinglePhoton} environment differences~\cite{CMS:2024gxp}.
+
+
+The calculation of the trigger efficiency is carried out by using the orthogonal \verb|HLT_Mu50_v*| trigger as the reference trigger. The offline $H_T$ threshold at which the trigger can be considered to be fully efficient was estimated by fitting the trigger efficiency as a function of $H_T$ to an error function (erf) and an algebraic function ($f$):
 \begin{align}
 \text{erf}(H_T ;\ A,B,C) & = \frac A2 \left[1+ \text{erf}\left(\dfrac{H_T - |B|}{C}\right) \right]\label{eq:erf} \\
 f(H_T ;\ A,B,C,D) & = A \dfrac{\frac{H_T - B}{C}}{1+ \left(\frac{H_T - B}{C}\right)^2} + D \label{eq:alg}
 \end{align}
-Where \eqref{eq:erf} and \eqref{eq:alg} are modeled after the sigmoid-like functions
+where \cref{eq:erf,eq:alg} are modeled after the sigmoid-like functions
 \[
 \text{erf}(x) = \frac{2}{\sqrt{\pi}} \int_0^x e^{-t^2} \dd{t}
 \]
@@ -82,14 +163,209 @@ \section{Trigger Efficiency and Scale Factor studies}
 \]
-The fit result is used to determine the threshold at which the $H_T$ trigger is expected to reach 99\% of their plateau value. This is also to assist in the termination of the offline HT cut applied to signal event selection, to make sure that signal events are not impacted too much by the trigger turn on effects. Figures 4.1 (a-d), shows the trigger efficiency as a function of event HT evaluated in the 4 data collection eras using the JetHT data stream compared with QCD MC along with an estimate of the trigger plateau value.
-More specifically Figures 4.1 (a-d) compare of efficiency for HT trigger as a function of event HT measured relative to \verb|HLT_Mu50_v*| in data (black) and QCD MC (gray) and fit to the algebraic function f (line). The scale factor values used for signal MC can be found in Tables 4.1-4.3, the uncertainties in the table are just the statistical uncertainties of data and MC selection efficiency propagated appropriately. Tables show Scale factors (SF) and statistical uncertainties of the HT trigger for 2016HIPM (Table 4.1) , 2017 (Table 4.2) and 2018 (Table 4.3).
+The fit result is used to determine the threshold at which the $H_T$ trigger is expected to reach 99\% of its plateau value. This also assists in the determination of the offline $H_T$ cut applied to the signal event selection, to make sure that signal events are not impacted too much by the trigger turn-on effects. \Cref{fig:HT_efficiencies} shows the trigger efficiency as a function of event $H_T$ evaluated in the 4 data collection eras using the \textit{JetHT} data stream compared with QCD simulation, along with an estimate of the trigger plateau value.
+More specifically, \cref{fig:HT_eff_16,fig:HT_eff_16_HIPM,fig:HT_eff_17,fig:HT_eff_18} compare the efficiency of the \HT trigger as a function of event \HT, measured relative to \verb|HLT_Mu50_v*| in data (black) and QCD MC (gray), together with the fit to the algebraic function \textit{f} (line). With the efficiency computed in each range of \HT, we can compute the ratio between the efficiencies in data and in MC. The ratio of the trigger efficiency in data vs. that in QCD MC is applied to each signal MC event as an $H_T$-dependent scale factor, and the difference in event acceptance between applying the nominal scale factors and applying the scale factors shifted by their statistical uncertainty is treated as a systematic uncertainty.
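+As an illustration of how such a turn-on fit can be performed numerically, the short Python sketch below fits the error-function form of \cref{eq:erf} to a set of made-up efficiency points with SciPy and extracts the \HT value at which the curve reaches 99\% of its plateau; the numbers are purely illustrative and this is not the analysis code.
+\begin{verbatim}
+import numpy as np
+from scipy.optimize import curve_fit
+from scipy.special import erf, erfinv
+
+def erf_turnon(ht, A, B, C):
+    # A/2 * [1 + erf((HT - |B|)/C)], as in the erf fit function above
+    return 0.5 * A * (1.0 + erf((ht - np.abs(B)) / C))
+
+# Made-up efficiency points (illustration only, not the measured values)
+ht  = np.array([900., 950., 1000., 1050., 1100., 1200., 1400., 1800., 2200.])
+eff = np.array([0.20, 0.45, 0.72, 0.90, 0.96, 0.99, 1.00, 1.00, 1.00])
+
+popt, pcov = curve_fit(erf_turnon, ht, eff, p0=[1.0, 1000.0, 100.0])
+A, B, C = popt
+# 99% of the plateau (A) is reached when erf(z) = 0.98, i.e. z = erfinv(0.98)
+ht_99 = np.abs(B) + erfinv(0.98) * C
+print("plateau = %.3f, 99%% of plateau at HT ~ %.0f GeV" % (A, ht_99))
+\end{verbatim}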
+\begin{figure}
+ \centering
+ \begin{subfigure}{.45\textwidth}
+ \includegraphics[width=\linewidth]{Images/pdfs/16_efficiency_withratio_and_fits.pdf}
+ \caption{Run2 2016}
+ \label{fig:HT_eff_16}
+ \end{subfigure}
+ %
+ \begin{subfigure}{.45\textwidth}
+ \includegraphics[width=\linewidth]{Images/pdfs/16-APV-HIPM_efficiency_withratio_and_fits.pdf}
+ \caption{Run2 2016 HIPM}
+ \label{fig:HT_eff_16_HIPM}
+ \end{subfigure}
+
+ \begin{subfigure}{.45\textwidth}
+ \includegraphics[width=\linewidth]{Images/pdfs/17_efficiency_withratio_and_fits.pdf}
+ \caption{Run2 2017}
+ \label{fig:HT_eff_17}
+ \end{subfigure}
+ %
+ \begin{subfigure}{.45\textwidth}
+ \includegraphics[width=\linewidth]{Images/pdfs/18_efficiency_withratio_and_fits.pdf}
+ \caption{Run2 2018}
+ \label{fig:HT_eff_18}
+ \end{subfigure}
+ \caption[Comparison of trigger efficiencies for \HT trigger]{Comparison of the efficiency of the \HT trigger as a function of event \HT, measured relative to \texttt{HLT\_Mu50\_v*} in data (black) and QCD MC (gray), and the fit to the algebraic function \textit{f} (line).}
+ \label{fig:HT_efficiencies}
+\end{figure}
+
+The scale factor values used for signal MC can be found in \cref{tab:2016_triggerSF,tab:2016HIPM_triggerSF,tab:2017_triggerSF,tab:2018_triggerSF}. The uncertainties in the tables are the statistical uncertainties of the data and MC selection efficiencies, propagated appropriately.
+
+
+\begin{table}
+ \centering
+ \caption{Scale factors (SF) and statistical uncertainties of the \HT trigger for 2016.}
+ \label{tab:2016_triggerSF}
+ \begin{tabular}{cccc}
+ \hline
+ \HT bin [GeV] & SF & uncertainty (upper) & uncertainty (lower) \\
+ \hline
+ 1200.0 - 1375.0 & 0.987578 & 0.001114 & 0.001227 \\
+ 1375.0 - 1500.0 & 0.993525 & 0.001043 & 0.001235 \\
+ 1500.0 - 1625.0 & 0.998121 & 0.000693 & 0.001015 \\
+ 1625.0 - 1750.0 & 0.998326 & 0.000801 & 0.001325 \\
+ 1750.0 - 1875.0 & 1.001928 & 0.000708 & 0.001671 \\
+ 1875.0 - 2000.0 & 0.998974 & 0.000848 & 0.002358 \\
+ 2000.0 - 2125.0 & 1.006414 & 0.000855 & 0.002898 \\
+ 2125.0 - 2250.0 & 0.997732 & 0.001876 & 0.005197 \\
+ 2250.0 - 2375.0 & 1.000000 & 0.000000 & 0.006580 \\
+ 2375.0 - 2500.0 & 1.000000 & 0.000000 & 0.009449 \\
+ \hline
+ \end{tabular}
+\end{table}
+
+\begin{table}
+ \centering
+ \caption{Scale factors (SF) and statistical uncertainties of the \HT trigger for 2016HIPM.}
+ \label{tab:2016HIPM_triggerSF}
+ \begin{tabular}{cccc}
+ \hline
+ \HT bin [GeV] & SF & uncertainty (upper) & uncertainty (lower) \\
+ \hline
+ 1200.0 - 1375.0 & 0.995561 & 0.000618 & 0.000723 \\
+ 1375.0 - 1500.0 & 0.997767 & 0.000536 & 0.000697 \\
+ 1500.0 - 1625.0 & 0.999147 & 0.000408 & 0.000682 \\
+ 1625.0 - 1750.0 & 0.998692 & 0.000626 & 0.001037 \\
+ 1750.0 - 1875.0 & 0.999505 & 0.000410 & 0.001143 \\
+ 1875.0 - 2000.0 & 1.000000 & 0.000000 & 0.001484 \\
+ 2000.0 - 2125.0 & 0.996381 & 0.001969 & 0.003510 \\
+ 2125.0 - 2250.0 & 1.000000 & 0.000000 & 0.003497 \\
+ 2250.0 - 2375.0 & 0.997389 & 0.002160 & 0.005981 \\
+ 2375.0 - 2500.0 & 1.000000 & 0.000000 & 0.006582 \\
+ \hline
+ \end{tabular}
+\end{table}
+
+\begin{table}
+ \centering
+ \caption{Scale factors and statistical uncertainties of the \HT trigger for 2017.}
+ \label{tab:2017_triggerSF}
+ \begin{tabular}{cccc}
+ \hline
+ \HT bin [GeV] & SF &
uncertainty (upper) & uncertainty (lower) \\
+ \hline
+ 1200.0 - 1375.0 & 0.965681 & 0.001357 & 0.001412 \\
+ 1375.0 - 1500.0 & 0.980217 & 0.001125 & 0.001199 \\
+ 1500.0 - 1625.0 & 0.985178 & 0.001111 & 0.001201 \\
+ 1625.0 - 1750.0 & 0.989094 & 0.001191 & 0.001328 \\
+ 1750.0 - 1875.0 & 0.995385 & 0.000992 & 0.001215 \\
+ 1875.0 - 2000.0 & 0.998218 & 0.000735 & 0.001111 \\
+ 2000.0 - 2125.0 & 0.998595 & 0.001134 & 0.001635 \\
+ 2125.0 - 2250.0 & 1.000000 & 0.000000 & 0.001187 \\
+ 2250.0 - 2375.0 & 1.000000 & 0.000000 & 0.001821 \\
+ 2375.0 - 2500.0 & 1.000277 & 0.000166 & 0.002397 \\
+ 2500.0 - 2625.0 & 1.005464 & 0.001036 & 0.003895 \\
+ \hline
+ \end{tabular}
+\end{table}
+
+\begin{table}
+ \centering
+ \caption{Scale factors and statistical uncertainties of the \HT trigger for 2018.}
+ \label{tab:2018_triggerSF}
+ \begin{tabular}{cccc}
+ \hline
+ \HT bin [GeV] & SF & uncertainty (upper) & uncertainty (lower) \\
+ \hline
+ 1200.0 - 1375.0 & 0.947967 & 0.001657 & 0.001723 \\
+ 1375.0 - 1500.0 & 0.973940 & 0.001332 & 0.001409 \\
+ 1500.0 - 1625.0 & 0.983915 & 0.001286 & 0.001392 \\
+ 1625.0 - 1750.0 & 0.985506 & 0.001518 & 0.001685 \\
+ 1750.0 - 1875.0 & 0.993555 & 0.001249 & 0.001515 \\
+ 1875.0 - 2000.0 & 0.997541 & 0.001054 & 0.001498 \\
+ 2000.0 - 2125.0 & 0.997264 & 0.001403 & 0.002103 \\
+ 2125.0 - 2250.0 & 1.000000 & 0.000000 & 0.001596 \\
+ 2250.0 - 2375.0 & 0.997558 & 0.001577 & 0.003217 \\
+ 2375.0 - 2500.0 & 0.998141 & 0.001538 & 0.004268 \\
+ \hline
+ \end{tabular}
+\end{table}
+
+For completeness, we also checked the trigger efficiencies in a region of phase space where we expect to be signal-free (i.e., the \textit{SinglePhoton} data stream). The corresponding plots for these unprescaled triggers, divided by year, are shown in \cref{fig:HT_eff_SinglePhoton_16,fig:HT_eff_SinglePhoton_16_HIPM,fig:HT_eff_SinglePhoton_17,fig:HT_eff_SinglePhoton_18}.
+More detailed plots of the fit functions around the turn-on region can be found in \cref{fig:fits},
+which shows the $H_T$ trigger efficiencies together with each fit.
+
+
+
+\begin{figure}
+ \centering
+ \begin{subfigure}{.45\textwidth}
+ \includegraphics[width=\linewidth]{Images/pdfs/16_SinglePhoton_efficiency_withratio_and_fits.pdf}
+ \caption{Run2 SinglePhoton 2016}
+ \label{fig:HT_eff_SinglePhoton_16}
+ \end{subfigure}
+ %
+ \begin{subfigure}{.45\textwidth}
+ \includegraphics[width=\linewidth]{Images/pdfs/16-APV-HIPM_SinglePhoton_efficiency_withratio_and_fits.pdf}
+ \caption{Run2 SinglePhoton 2016 HIPM}
+ \label{fig:HT_eff_SinglePhoton_16_HIPM}
+ \end{subfigure}
-A detailed plot of the function fits around the turn on region can be found in Figures 4.2 - 4.5 that show the HT trigger efficiencies evaluated in 2016 data (Figure 4.2), 2016HIPM data (Figure 4.3), 2017 data (Figure 4.4) and 2018 data (Figure 4.5) and in each fits to the error function and the algebraic function.
+ \begin{subfigure}{.45\textwidth}
+ \includegraphics[width=\linewidth]{Images/pdfs/17_SinglePhoton_efficiency_withratio_and_fits.pdf}
+ \caption{Run2 SinglePhoton 2017}
+ \label{fig:HT_eff_SinglePhoton_17}
+ \end{subfigure}
+ %
+ \begin{subfigure}{.45\textwidth}
+ \includegraphics[width=\linewidth]{Images/pdfs/18_SinglePhoton_efficiency_withratio_and_fits.pdf}
+ \caption{Run2 SinglePhoton 2018}
+ \label{fig:HT_eff_SinglePhoton_18}
+ \end{subfigure}
+ \caption[Comparison of trigger efficiencies for \HT trigger for the signal free region.]{Comparison of the efficiency of the \HT trigger as a function of event \HT for the SinglePhoton data stream.
Measured relative to \texttt{HLT\_Mu50\_v*} in data (black) and QCD MC (gray) and fit to the algebraic function \textit{f} (line).}
+\end{figure}
+\begin{figure}
+ \centering
+ \begin{subfigure}{.36\linewidth}
+ \includegraphics[width=\linewidth]{Images/pdfs/fits_16.pdf}
+ % \caption{Caption}
+ % \label{fig:enter-label}
+ \end{subfigure}%
+ \begin{subfigure}{.36\linewidth}
+ \includegraphics[width=\linewidth]{Images/pdfs/fits_closeup_16.pdf}
+ % \caption{Caption}
+ % \label{fig:enter-label}
+ \end{subfigure}
+ \begin{subfigure}{.36\linewidth}
+ \includegraphics[width=\linewidth]{Images/pdfs/fits_16-APV-HIPM.pdf}
+ % \caption{Caption}
+ % \label{fig:enter-label}
+ \end{subfigure}%
+ \begin{subfigure}{.36\linewidth}
+ \includegraphics[width=\linewidth]{Images/pdfs/fits_closeup_16-APV-HIPM.pdf}
+ % \caption{Caption}
+ % \label{fig:enter-label}
+ \end{subfigure}
+ \begin{subfigure}{.36\linewidth}
+ \includegraphics[width=\linewidth]{Images/pdfs/fits_17.pdf}
+ % \caption{Caption}
+ % \label{fig:enter-label}
+ \end{subfigure}%
+ \begin{subfigure}{.36\linewidth}
+ \includegraphics[width=\linewidth]{Images/pdfs/fits_closeup_17.pdf}
+ % \caption{Caption}
+ % \label{fig:enter-label}
+ \end{subfigure}
+ \begin{subfigure}{.36\linewidth}
+ \includegraphics[width=\linewidth]{Images/pdfs/fits_18.pdf}
+ % \caption{Caption}
+ % \label{fig:enter-label}
+ \end{subfigure}%
+ \begin{subfigure}{.36\linewidth}
+ \includegraphics[width=\linewidth]{Images/pdfs/fits_closeup_18.pdf}
+ % \caption{Caption}
+ % \label{fig:enter-label}
+ \end{subfigure}
+ \caption{More detailed plots that show information about the goodness-of-fit, namely the $\chi^2$ statistic.}
+ \label{fig:fits}
+\end{figure}
-Trigger efficiency: As shown in Fig 73, the trigger selection efficiency is different between data
-and MC simulation. The ratio of the trigger efficiency in data vs. that in QCD MC is applied
-to each signal MC event as a $H_T$-dependent scaling factor, and the difference in the event acceptance of applying the scale factor and applying the scale factors with a shifted statistical
-uncertainty is treated as its systematic uncertainty.
+\clearpage
diff --git a/chapters/Chapter05.tex b/chapters/Chapter05.tex
index e59031a..79da0da 100644
--- a/chapters/Chapter05.tex
+++ b/chapters/Chapter05.tex
@@ -1,9 +1,134 @@
-\chapter{Machine Learning for Tracker Data Quality Monitoring (DQM) \label{ch:DQM}}
-DQM is for a high-quality data where increasing data size (10 times more Pixel readout channels for Inner Tracker alone during HL-LHC) is creating a huge challenge for shifters to monitor and certify it. Use of ML techniques in DQM can aid shifters detect data anomalies as detector conditions change. Currently, a shifter assesses data quality by a manual scrutiny - comparing histograms with reference ones, looking for any deviations.
-We are exploring ML Playground (MLP), a Django-based framework, to automate DQM. It groups training dataset information, automates ML training, and generates reports based on the performance of ML model.
-Fidalgo [31] developed a python code to extract information from Run Registry (RR) on Good/Bad runs. RR along with (1) DQM graphical interface (for 1D and 2D histograms), (2) OMS (Online Monitoring System, for beam conditions like luminosity, pile-up, trigger info) and (3) TkMaps (Tracker Maps for information organized geometrically to visualize issues and data trends in detector components) provides input to the ML framework. Fig.
6 shows a cronjob script by Fidalgo to execute DAS queries and copy files to any area with password-less certificate.
+\chapter{Machine Learning for Tracker Data Quality Monitoring (ML4DQM) \label{ch:DQM}}
+
+\section{DQM Workflows}
+Physicists at CERN use the world’s largest and most complex machinery to probe the fundamental forces of nature.
+The idea of Data Quality Monitoring (DQM) is simple: monitor the data flow from the Compact Muon Solenoid (CMS) detector and certify that it is useful for physics analysis.
+To operate a sophisticated and complex apparatus such as CMS, quick online feedback on the quality of the data recorded is needed to avoid taking low-quality data and to guarantee a good baseline for the offline analysis. Collecting good datasets from the collisions is an important step toward the search for ``new physics'', since the overwhelming amount of new data poses an extra challenge for processing and storage.
+This makes it all the more important to design algorithms and special software to control the data quality. Hence, DQM plays a critical role in the maintainability of the experiment and its operational efficiency, and it also provides reliable data certification~\cite{refId0}.
+
+
+The current DQM paradigm (see \cref{fig:DQM_workflow}) has the human shifter in mind. A person spends hours looking at various histograms that indicate the status of the data flow, detector conditions, and other activities. These histograms are checked manually and visually against a reference to help determine the data quality and detector performance.
+\begin{figure}
+ \centering
+ \includegraphics[width=.9\linewidth]{Images/DQM Workflow.png}
+ \caption{The DQM workflow}
+ \label{fig:DQM_workflow}
+\end{figure}
+The DQM workflow consists of two types: Online and Offline.
+The Online DQM receives data from the event and trigger histograms and produces results in the form of monitoring elements such as reference histograms and quality reports. This live monitoring of each detector’s status during data taking gives the online crew the possibility to identify problems with extremely low latency, minimizing the amount of data that would otherwise be unsuitable for physics analysis. The scrutiny of the Online DQM is a 24/7 job carried out by shifters at the CMS control center who constantly monitor the hundreds of different plots and histograms produced by the DQM software. This consumes a lot of manpower and is strenuous work.
+Offline DQM is more focused on the full statistics over the entire run of the experiment and works with bookkeeping and data certification. In the offline environment, the system is used to review the results of the final data reconstruction on a run-by-run basis, serving as the foundation for certified data used across the CMS collaboration in all physics analyses.
+
+
+\section{DQM Tools}
+The DQM tools allow shifters to gather the information necessary for performing data certification. The platform used to certify any run's worth of data is the \textit{Certification Helper} (see \cref{fig:certhelper}). This is a web application that provides shifters with information about the runs that have or have not been certified.
+Inside this tool, one can view (as shown in \cref{fig:certhelper-portal,fig:certhelper-cert}) the list of runs certified, a portal to select runs for certification, and a page with a checklist to certify the run.
+
+\begin{figure}
+ \centering
+ \includegraphics[width=.75\linewidth]{Images/certhelper-menu.png}
+ \caption{\textit{Certification Helper} is a web app that allows shifters to view, certify, and gather information on a given run.}
+ \label{fig:certhelper}
+\end{figure}
+
+\begin{figure}
+ \centering
+
+ \begin{subfigure}{.7\linewidth}
+ \includegraphics*[width=\linewidth,trim= 4in 1.5in 4in 0]{Images/certhelper-portal.png}
+ \end{subfigure}
+
+ \begin{subfigure}{\linewidth}
+ \includegraphics[width=1\linewidth]{Images/certhelper-list.png}
+ \end{subfigure}
+ \caption{The \textit{Certification Helper} portal that allows shifters to select and view which runs are available to certify.}
+ \label{fig:certhelper-portal}
+\end{figure}
+\begin{figure}
+ \centering
+ \includegraphics[width=1\linewidth]{Images/certhelp-cert.png}
+ \caption{The view when certifying a run on \textit{Certification Helper}}
+ \label{fig:certhelper-cert}
+\end{figure}
+
+
+Other DQM tools that are frequently used are the \textit{DQM GUI}, \textit{Run Registry}, and \textit{Online Monitoring System} (OMS). We can see snippets of those tools in \cref{fig:dqmgui,fig:RR,fig:OMS}.
+
+
+\begin{figure}
+ \includegraphics*[width=\linewidth,trim= 0 7in 1in 0 ]{Images/DQM GUI.png}
+ \caption{The DQM GUI shows many histograms that shifters use to determine the quality of a run.}
+ \label{fig:dqmgui}
+\end{figure}
+
+\begin{figure}
+ \includegraphics*[width=\linewidth,trim= 2.9in 4.4in 0 0in]{Images/RR.png}
+ \caption{Run Registry. This page is a database that shows the datasets under which each run is classified and also shows its DQM certification.}
+ \label{fig:RR}
+\end{figure}
+
+
+\begin{figure}
+ \includegraphics*[width=1\linewidth,trim = .8in 1.1in .9in 2.19in]{Images/OMS.png}
+ \caption{OMS webpage. This shows detector and data-taking conditions and statistics.}
+ \label{fig:OMS}
+\end{figure}
+
+
+\section{Challenges of DQM}
+
+The current DQM process presents many challenges that need to be addressed:
+\begin{itemize}
+ \item The process ultimately depends on the decisions made by shifters during DQM shifts. As time passes, shifters must be trained regularly to learn the DQM process and each particular subsystem's specific metrics. The dependence on human shifters leaves the process vulnerable to unforeseen outside influences (such as illness, a pandemic, lack of worker availability, etc.). This allows for unpredictable mistakes and biases in the monitoring and certification workflows.
+
+ \item Worsening this, the number of histograms to be checked is on the order of 50--100, and many have unique metrics to define what is considered nominal. People can make mistakes and miss errors even with a dedicated team of experts for each subsystem to guide the shifters.
+
+ \item The detector is subject to transient problems that can be overlooked during visual inspection of the monitoring elements~\cite{ML4DQM}.
+
+ \item Detector conditions can change drastically enough to require a change in the selection of the reference material. This is also determined by shift experts.
+
+ \item A lot of documentation can be found to learn about the DQM procedure, but it needs to be kept up to date manually; outdated information can affect shifter decisions.
+
+ \item The current workflow certifies data on a run-by-run basis (i.e., run granularity). This is especially relevant with the upcoming HL-LHC, where detector conditions will allow for more data to be collected per unit of time.
+\end{itemize}
+
+To address some of these issues, recent campaigns have emerged to reduce the manpower that both Online and Offline DQM entail. With the development of Machine Learning techniques, the hope is to automate the DQM scrutiny to a point where shifters can give better quality certifications more efficiently. We also aim to use ML to sift through the data at a finer granularity, using shorter time intervals known as luminosity sections (LS).
+Each run is divided into LSs, each an interval corresponding to a fixed number of proton-beam orbits in the LHC and amounting to approximately 23 seconds.
+This finer granularity increases the number of histograms to be monitored by 1 or 2 orders of magnitude, making it infeasible for human certification~\cite{ML4DQM}. The work presented here follows this direction to help improve the effort.
+
+\section{Reference Run Ranking (non-ML)}
+
+Part of the work done to enable tools that allow for ML-enabled DQM workflows was a project to develop a ranking system for the runs to be used as references. Reference runs are typically characterized as nominal based on the data-taking conditions that were present in them. Also, the reference has to be long enough (i.e., contain many LSs) to gather meaningful statistics. Currently, an expert shifter decides, based on the data-taking conditions and the feedback gathered from DQM shifters, if and when a new run should be considered to be used as a reference. There are a number of quality checks that this reference run candidate must pass, but it is ultimately the shift leader who decides whether it becomes the next reference. To help with this decision, I have started work on a ranking system that provides shift leaders with a table of potential candidates. The ranking system looks at conditions like the length of the run, how old the run is, and the number of pileup collisions per event. It then passes these comparisons through a grading metric and returns a table with the results, as seen in \cref{fig:ranking}.
+\begin{figure}
+ \centering
+ \includegraphics[width=\linewidth]{Images/ranking.png}
+ \caption{Reference run ranking system demo.}
+ \label{fig:ranking}
+\end{figure}
+
+
+\section{ML Playground}
+Following the same spirit of moving towards an ML-enabled DQM, an effort to develop a platform where ML models can be deployed, trained, and tested for data certification has resulted in what is now called the \textit{Machine Learning Playground} (MLP). MLP is a Django-based framework to automate DQM. It groups training dataset information, automates ML training, and generates reports based on the performance of an ML model. It is a web application that provides easy access to the most common monitoring elements that are found in the DQM GUI, as well as the data-taking conditions and information found in OMS and the RunRegistry. It also allows for simpler data exploration through its Python API, and it is designed to be scalable, allowing users of other subsystems to benefit from the playground by hosting all the necessary subsystem-specific information.
+
+In this project, I developed code to automate the data ingestion of the MLP by using a cronjob. This cronjob continuously queries another database, the \textit{Data Aggregation System} (DAS), to gather lists of newly generated files. The script then downloads and copies the files to our CERN-based filesystem, EOS; a simplified sketch of this step is shown below.
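+The following Python sketch illustrates the idea behind this ingestion step, listing the files of a dataset through the \verb|dasgoclient| command-line tool and copying the ones not yet present with \verb|xrdcp|; the dataset name, redirector, and EOS path are placeholders, and the exact flags and structure of the real ingestion scripts may differ.
+\begin{verbatim}
+import os
+import subprocess
+
+# Placeholders for illustration: not the real dataset or destination.
+DATASET = "/SomePrimaryDataset/Run2023A-PromptReco-v1/DQMIO"
+REDIRECTOR = "root://cms-xrd-global.cern.ch/"
+EOS_DIR = "/eos/project/mlplayground/dqmio"
+
+def list_files(dataset):
+    # Ask DAS for the logical file names (LFNs) belonging to the dataset.
+    query = "file dataset={}".format(dataset)
+    out = subprocess.run(["dasgoclient", "--query", query],
+                         capture_output=True, text=True, check=True)
+    return out.stdout.split()
+
+def copy_new_files(dataset):
+    # Copy only the files that are not already present in the EOS area.
+    for lfn in list_files(dataset):
+        dest = os.path.join(EOS_DIR, os.path.basename(lfn))
+        if os.path.exists(dest):
+            continue
+        subprocess.run(["xrdcp", REDIRECTOR + lfn, dest], check=True)
+
+copy_new_files(DATASET)
+\end{verbatim}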
Afterwards, the script indexes the newly copied files in the MLP database and executes the MLP's parsing capabilities, allowing the MLP to read and display the information contained inside the files.
+I have added logging functionality for detailed bookkeeping in case the scripts involved fail.
+
+
+% from Run Registry (RR) on Good/Bad runs, and OMS for beam conditions like luminosity, pile-up, and trigger info.
+% ) and (3) TkMaps (Tracker Maps for information organized geometrically to visualize issues and data trends in detector components) provides input to the ML framework.
+% is for high-quality data where increasing data size (10 times more Pixel readout channels for Inner Tracker alone during HL-LHC) is creating a huge challenge for shifters to monitor and certify it. Use of ML techniques in DQM can aid shifters detect data anomalies as detector conditions change. Currently, a shifter assesses data quality by a manual scrutiny - comparing histograms with reference ones, looking for any deviations.
+
+\begin{figure}
+ \centering
+ \includegraphics*[width=\linewidth,trim = 1cm 5.2in 13.6in 0]{Images/MLP.png}
+ \caption{ML playground web app}
+ \label{fig:MLplayground}
+\end{figure}
+
+
+There are two future tasks. The first is to implement robust checks of the files already present in the EOS space and to copy over only newly added files.
+The second is to implement a method that allows files already found in EOS to be forcibly updated or overwritten at the request of a user, if needed.
-There are two future tasks: Task 1 requires upload of newly collected data from DQMIO (Data Quality Monitoring Input Output) to the ML Playground Graphical User Interface (GUI) and organizing it efficiently for ML use. It requires (1) developing scripts to upload DQMIO files to the dedicated EOS space (EOS provides fast and reliable multi-PB disk-only storage) and track the "health" of the files uploaded, (2) monitor the file system at regular intervals and discover newly uploaded files, (3) run the MLP parser on the new files, (4) implement robust checks of the files already present in the EOS space and attempt to copy over only newly added files to the list and (5) implement a method that allows for files that are already found in the EOS to be forcibly updated or overwritten at the request of a user, if needed and (6) add logging functionality for detailed bookkeeping in case scripts involved fail. This entails use of DAS (Data Aggregation System) query for files in datasets of interest, iterate through the content generated and copy it to EOS where all DQMIO files are stored for the GUI. Task 2 will develop a Reference Run Ranking (RRR) tool based on the files provided. When a target run (i.e., a run that the shift leader needs to find a reference run for, during Data Certification (DC) and a set of prior candidate runs are provided to RRR, it will rank these candidate runs from best to worst based on their suitability as reference runs. This should ease the reference run selection and, by identifying high-quality runs, serve as a resource for shifters to find robust training datasets for ML models accessible through the MLP. A reference run should meet two criteria: (1) contain a high amount of LumiSections (LS) to ensure statistical robustness and (2) the run’s data-taking conditions should be as similar as possible to those of the target run.
Past efforts failed to consistently place the reference run at the top of the list of candidates due to two main problems: firstly, the parameters characterizing each run were not normalized, leading to an artificial inflation of the importance of some parameters over others and secondly, the weights assigned to each parameter were not systematically determined but were instead based on expert opinion and experience. To mitigate these issues, feature vectors would be constructed for both target and candidate runs using (1) Run level features like initial luminosity, end luminosity, change in luminosity, delivered luminosity, start time of the run, and run number and (2) LS level features like average and standard deviation of initial luminosity, average and standard deviation of the end luminosity, average and standard deviation of pileup. Weighted Euclidean distance between feature vectors of the candidate runs and the target run would be computed to gauge similarity between them. A systematic approach would be implemented to determine the appropriate weights for this metric, that will include the normalization and possible standardization of features to ensure that they are comparable irrespective of the statistical approach chosen. diff --git a/chapters/Conclusion.tex b/chapters/Conclusion.tex index 7559e42..a8ee870 100644 --- a/chapters/Conclusion.tex +++ b/chapters/Conclusion.tex @@ -1,11 +1,11 @@ \chapter{Conclusion}\label{ch:conclusion} -There are three major topics of research that were discussed in this dissertation: The simulation studies involving the counting of L1-stubs for the HL-LHC CMS Inner Tracker upgrade ({stubs}), the overall 2016 search for SUSY in the all-hadronic channel using a customized top-tagger ({AnalysisChap}) and the improvements made for the estimation of the Z$\rightarrow \nu\bar{\nu}$+ jets background using an additional control region from $\gamma$+ jets events ({estimation}). These studies were explained in detail in their respective chapters and their individual results are provided. A summary of the most important results from each study is provided in this chapter. +% There are three major topics of research that were discussed in this dissertation: The simulation studies involving the counting of L1-stubs for the HL-LHC CMS Inner Tracker upgrade ({stubs}), the overall 2016 search for SUSY in the all-hadronic channel using a customized top-tagger ({AnalysisChap}) and the improvements made for the estimation of the Z$\rightarrow \nu\bar{\nu}$+ jets background using an additional control region from $\gamma$+ jets events ({estimation}). These studies were explained in detail in their respective chapters and their individual results are provided. A summary of the most important results from each study is provided in this chapter. -\section{L1 Stub Counting for the HL-LHC CMS Tracker Upgrade} +% \section{L1 Stub Counting for the HL-LHC CMS Tracker Upgrade} -Results from this study (detailed in {stubs}) reflect the overall effects that were expected beforehand. The removal of discs from the standard pixel geometry (consisting of 8 small and 4 large discs) results in a noticeable reduction of stub hits in the upgraded CMS Outer Tracker. This effect is specially apparent if the disc that is removed is closer to the interaction point, due to the much larger volume of particles that are present in this region. 
Therefore, the reduction in stubs is more pronounced when a small disc is removed (as in the case of the $7s4l$ geometry) than if a large disc is removed (as in the $8s3l$ pixel geometry). The reason for this effect stems from the fact that as particles travel through the various layers of the Inner Tracker material, some of them are bound to interact with it, producing particles that did not originate from the initial proton-proton collision. The stubs produced via such processes are considered to be ``fake'' stubs. To confirm these findings, an additional study was conducted using a sample that was virtually indistinguishable from the standard pixel geometry, but with the second disc on the positive side ``turned off'' or ``dead''. The results from this study confirm the initial findings and shows that there is indeed a correlation between the average number of stubs detected in the Outer Tracker and the total amount of material present in the upgraded Inner Tracker. An important factor that needs to be taken into account when interpreting these results is the re-optimization of the disc positions after removing a disc in the different pixel geometries considered. This feature could provide a possible explanation as to why the $6s3l$ geometry, which has two less small discs than the standard geometry (and one less large one), was found to have less of an effect on the average number of stubs than the $7s4l$ geometry. +% Results from this study (detailed in {stubs}) reflect the overall effects that were expected beforehand. The removal of discs from the standard pixel geometry (consisting of 8 small and 4 large discs) results in a noticeable reduction of stub hits in the upgraded CMS Outer Tracker. This effect is specially apparent if the disc that is removed is closer to the interaction point, due to the much larger volume of particles that are present in this region. Therefore, the reduction in stubs is more pronounced when a small disc is removed (as in the case of the $7s4l$ geometry) than if a large disc is removed (as in the $8s3l$ pixel geometry). The reason for this effect stems from the fact that as particles travel through the various layers of the Inner Tracker material, some of them are bound to interact with it, producing particles that did not originate from the initial proton-proton collision. The stubs produced via such processes are considered to be ``fake'' stubs. To confirm these findings, an additional study was conducted using a sample that was virtually indistinguishable from the standard pixel geometry, but with the second disc on the positive side ``turned off'' or ``dead''. The results from this study confirm the initial findings and shows that there is indeed a correlation between the average number of stubs detected in the Outer Tracker and the total amount of material present in the upgraded Inner Tracker. An important factor that needs to be taken into account when interpreting these results is the re-optimization of the disc positions after removing a disc in the different pixel geometries considered. This feature could provide a possible explanation as to why the $6s3l$ geometry, which has two less small discs than the standard geometry (and one less large one), was found to have less of an effect on the average number of stubs than the $7s4l$ geometry. 
-\section{Search for SUSY in the All-Hadronic Channel} +% \section{Search for SUSY in the All-Hadronic Channel} -The analysis presented in {AnalysisChap} shows the results of a search for SUSY in the 0-lepton final state using a customized top-tagger. The data was obtained from proton-proton collisions at the CMS detector during 2016 with a total integrated luminosity of 35.9 fb$^{-1}$ at a center-of-mass energy of 13 TeV. The search was conducted by specifying 84 non-overlapping regions of phase space with varying requirements on the $N_\text{b}$, $N_\text{t}$, $p_{\text{T}}^{miss}$, $H_\text{T}$ and $m_\text{T2}$ variables ({SearchBinDef}). Several dominant and non-dominant backgrounds were identified and estimated to account for all the majority of the processes that were seen in the collected data. The estimation procedures and their respective systematic and statistical uncertainties are discussed in {backgrounds}. The total background prediction vs. data for all 84 search bins ({SearchBinResults}) shows no statistically significant deviation from the predicted SM background. The biggest sources background were shown to be the t$\bar{\text{t}}$ and W+jets processes, followed by Z($\nu\bar{\nu}$)+jets, which were seen to be dominant in regions with a high $p_\text{T}$ threshold. Meanwhile, the contributions from the QCD multijet and rare backgrounds are found to be nearly negligible in all of the 84 search bins. Exclusion limits were calculated from these results for each of the signal models used, by applying a binned likelihood fit on the data. The likelihood function was obtained for each of the 84 search regions as well as for each of the background data control samples from the product of the Poisson probability density function. Exclusion limits were placed on the top squark, gluino and LSP production cross-sections with a 95\% confidence level (CL), calculated using a modified frequentist approach with the CL$_s$ criterion and asymptotic results for the test statistic. The 95\% CL exclusion limits obtained for the T2tt model, which consists of direct top squark production, excludes top squark masses up to 1020 GeV and LSP masses up to 430 GeV. For the T1tttt model, gluino masses of up to 2040 GeV and LSP masses up to 1150 GeV are excluded, with corresponding limits of 2020 and 1150 GeV for the T1ttbb model, 2020 and 1150 GeV for the T5tttt model, and 1810 and 1100 GeV for the T5ttcc model. +% The analysis presented in {AnalysisChap} shows the results of a search for SUSY in the 0-lepton final state using a customized top-tagger. The data was obtained from proton-proton collisions at the CMS detector during 2016 with a total integrated luminosity of 35.9 fb$^{-1}$ at a center-of-mass energy of 13 TeV. The search was conducted by specifying 84 non-overlapping regions of phase space with varying requirements on the $N_\text{b}$, $N_\text{t}$, $p_{\text{T}}^{miss}$, $H_\text{T}$ and $m_\text{T2}$ variables ({SearchBinDef}). Several dominant and non-dominant backgrounds were identified and estimated to account for all the majority of the processes that were seen in the collected data. The estimation procedures and their respective systematic and statistical uncertainties are discussed in {backgrounds}. The total background prediction vs. data for all 84 search bins ({SearchBinResults}) shows no statistically significant deviation from the predicted SM background. 
The biggest sources background were shown to be the t$\bar{\text{t}}$ and W+jets processes, followed by Z($\nu\bar{\nu}$)+jets, which were seen to be dominant in regions with a high $p_\text{T}$ threshold. Meanwhile, the contributions from the QCD multijet and rare backgrounds are found to be nearly negligible in all of the 84 search bins. Exclusion limits were calculated from these results for each of the signal models used, by applying a binned likelihood fit on the data. The likelihood function was obtained for each of the 84 search regions as well as for each of the background data control samples from the product of the Poisson probability density function. Exclusion limits were placed on the top squark, gluino and LSP production cross-sections with a 95\% confidence level (CL), calculated using a modified frequentist approach with the CL$_s$ criterion and asymptotic results for the test statistic. The 95\% CL exclusion limits obtained for the T2tt model, which consists of direct top squark production, excludes top squark masses up to 1020 GeV and LSP masses up to 430 GeV. For the T1tttt model, gluino masses of up to 2040 GeV and LSP masses up to 1150 GeV are excluded, with corresponding limits of 2020 and 1150 GeV for the T1ttbb model, 2020 and 1150 GeV for the T5tttt model, and 1810 and 1100 GeV for the T5ttcc model. diff --git a/chapters/Introduction.tex b/chapters/Introduction.tex index 5957d7d..6f66413 100644 --- a/chapters/Introduction.tex +++ b/chapters/Introduction.tex @@ -7,6 +7,11 @@ \chapter{Introduction} The process gives physicists clues about how the particles interact and provides insights into the fundamental laws of nature. Nine\footnote{\url{https://home.cern/science/experiments}} experiments at the LHC use detectors to analyze particles produced by proton-proton collisions. The biggest of these experiments, ATLAS and CMS, are general-purpose detectors designed to study the fundamental nature of matter and fundamental forces and to look for new physics or evidence of particles that are beyond the Standard Model\footnote{\url{https://home.cern/about/physics/standard-model}}. Having two independently designed detectors is vital for cross-confirmation of any new discoveries made with minimal bias. The other two major detectors ALICE and LHCb, respectively, study a state of matter that was present just moments after the Big Bang and a preponderance of matter than antimatter. Each experiment does important research that is key to understanding the universe that surrounds and makes us. + +In particular, this work focuses on studies performed for the Emerging Jets analysis and on the development of tools that filter, evaluate, and certify the quality of the data collected by the CMS experiment. + +% Add more on the origins of the QCD-like hidden sector. Talk about the need for DQM + \Cref{ch:CMS} presents a basic description of the Large Hadron Collider and the CMS Detector. \Cref{ch:emj} presents a description and background on the Emerging Jets theory and analysis. % \Cref{ch:HLT} develops the technical aspects and usage of the HLT system in CMS.
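As a minimal sketch of the reference-run similarity metric described earlier (the notation here is assumed for illustration and is not taken from the source), the weighted Euclidean distance between a candidate run $c$ and the target run $t$, computed on standardized features, could take the form

% Hypothetical notation: $x_i^{(r)}$ is the $i$-th feature of run $r$, $\mu_i$ and $\sigma_i$ are the
% mean and standard deviation of that feature over the pool of candidate runs (standardization),
% and $w_i$ is the weight assigned to feature $i$, normalized so that the weights sum to one.
\begin{equation*}
  d(c,t) \;=\; \sqrt{\sum_{i=1}^{n} w_i \left( \frac{x_i^{(c)} - \mu_i}{\sigma_i} - \frac{x_i^{(t)} - \mu_i}{\sigma_i} \right)^{\!2}},
  \qquad \sum_{i=1}^{n} w_i = 1 .
\end{equation*}

With this form, standardization keeps features of very different scales (for example, run number versus pileup) from dominating the distance, setting a weight $w_i$ to zero simply removes that feature from the comparison, and candidate runs would be ranked by increasing $d(c,t)$.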
diff --git a/preamble.tex b/preamble.tex index bbfae88..faa7a13 100644 --- a/preamble.tex +++ b/preamble.tex @@ -11,6 +11,11 @@ % bindingoffset=6mm]{geometry} % for book \usepackage[letterpaper,margin=1in]{geometry} % for article \usepackage{graphicx,wrapfig} + +%Set images folder +\graphicspath{ {Images/} } + + \usepackage{bm} \usepackage{indentfirst} \usepackage{verbatim} @@ -52,10 +57,6 @@ %\bibliography{references.bib} \usepackage[nameinlink]{cleveref} -%\linenumbers - -%Set images folder -\graphicspath{ {Images/} } %Set Header and Footer for all pages \usepackage{fancyhdr} @@ -93,15 +94,30 @@ %Main Body \usepackage{lineno} -\makeatletter -\def\makeLineNumberLeft{% - \linenumberfont\llap{\hb@xt@\linenumberwidth{\LineNumber\hss}\hskip\linenumbersep}% left line number - \hskip\columnwidth% skip over a column of text - \rlap{\hskip\linenumbersep\hb@xt@\linenumberwidth{\hss\LineNumber}}\hss}% right line number -\leftlinenumbers% Re-issue [left] option -\makeatother +% \makeatletter +% \def\makeLineNumberLeft{% +% \linenumberfont\llap{\hb@xt@\linenumberwidth{\LineNumber\hss}\hskip\linenumbersep}% left line number +% \hskip\columnwidth% skip over a column of text +% \rlap{\hskip\linenumbersep\hb@xt@\linenumberwidth{\hss\LineNumber}}\hss}% right line number +% \leftlinenumbers% Re-issue [left] option +% \makeatother \linenumbers % Adding custom packages \usepackage{physics} +\usepackage{xspace} +\usepackage{graphbox} + +% Personal definitions +\newcommand{\Qdark}{\ensuremath{Q_{DK}}\xspace} +\newcommand{\pidark}{\ensuremath{\pi_{DK}}\xspace} +\newcommand{\Mdark}{\ensuremath{X_{DK}}\xspace} + +\newcommand{\dark}{ {DK} } +\newcommand{\ctau}{\ensuremath{c\tau}\xspace} +\newcommand{\dpi}{\ensuremath{\pi_\dark}\xspace} +\newcommand{\ctaudpi}{\ensuremath{\ctau_{\dpi}}\xspace} + +\newcommand{\pgroup}[1]{\ensuremath{\left(#1\right)}} +\newcommand{\HT}{\ensuremath{H_T}\xspace} diff --git a/references.bib b/references.bib index 4e5cd78..d4d2b48 100644 --- a/references.bib +++ b/references.bib @@ -1,3 +1,32 @@ + +@misc{hl-lhc, title={The HL-LHC project}, url={https://hilumilhc.web.cern.ch/content/hl-lhc-project}, journal={The HL-LHC project | High Luminosity LHC Project}} +@article{Bai_2014, + title="{Scale of dark QCD}", + volume={89}, + ISSN={1550-2368}, + url={http://dx.doi.org/10.1103/PhysRevD.89.063522}, + DOI={10.1103/physrevd.89.063522}, + number={6}, + journal={Physical Review D}, + publisher={American Physical Society (APS)}, + author={Bai, Yang and Schwaller, Pedro}, + year={2014}, + month=mar } + +@article{Renner_2018, + title={A flavoured dark sector}, + volume={2018}, + ISSN={1029-8479}, + url={http://dx.doi.org/10.1007/JHEP08(2018)052}, + DOI={10.1007/jhep08(2018)052}, + number={8}, + journal={Journal of High Energy Physics}, + publisher={Springer Science and Business Media LLC}, + author={Renner, Sophie and Schwaller, Pedro}, + year={2018}, + month=aug } + + @article{CMS:2024gxp, author = "Hayrapetyan, Aram and others", collaboration = "CMS", @@ -116,16 +145,18 @@ @article{archer2022emerging doi = {10.1007/JHEP02(2022)027} } -@article{DQM2019, - author = {{Azzolini, Virginia} and {Broen van, Besien} and {Bugelskis, Dmitrijus} and {Hreus, Tomas} and {Maeshima, Kaori} and {Javier Fernandez, Menendez} and {Norkus, Antanas} and {James Fraser, Patrick} and {Rovere, Marco} and {Andre Schneider, Marcel}}, - title = {The {Data Quality Monitoring Software for the CMS experiment at the LHC: past, present and future}}, - doi = {10.1051/epjconf/201921402003}, - url = 
{https://doi.org/10.1051/epjconf/201921402003}, - journal = {EPJ Web Conf.}, - year = 2019, - volume = 214, - pages = {02003} -} + +@article{refId0, + author = {{Azzolini, Virginia} and {Broen van, Besien} and {Bugelskis, Dmitrijus} and {Hreus, Tomas} and {Maeshima, Kaori} and {Javier Fernandez, Menendez} and {Norkus, Antanas} and {James Fraser, Patrick} and {Rovere, Marco} and {Andre Schneider, Marcel}}, + doi = {10.1051/epjconf/201921402003}, + journal = {EPJ Web Conf.}, + pages = {02003}, + title = {The Data Quality Monitoring Software for the CMS experiment at the LHC: past, present and future}, + url = {https://doi.org/10.1051/epjconf/201921402003}, + volume = 214, + year = 2019, + bdsk-url-1 = {https://doi.org/10.1051/epjconf/201921402003}} + @article{CMS:2016ngn, author = "Khachatryan, Vardan and others", title = "{The CMS trigger system}",