\documentclass[11pt]{article}
\usepackage[margin=1in]{geometry}
\usepackage{hyperref}
\begin{document}
\begin{center}
\LARGE
LING572 HW3 (Na\"ive Bayes)\\
Due: 11pm on Jan 30, 2020\\
\vspace{0.3in}
\end{center}
The example files are under dropbox/19-20/572/hw3/examples/.
\par\vspace{0.3 in}
\noindent {\bf Q1 (5 points):} Run the Mallet NB learner (i.e., the trainer's name
is NaiveBayes) with {\bf train.vectors.txt} as the training data
and {\bf test.vectors.txt} as the test data.
In your note file, write down the training accuracy and the test accuracy.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\par\vspace{0.4 in}
\noindent {\bf Q2 (35 points):} Write a script, {\bf build\_NB1.sh},
that implements the Multi-variate Bernoulli NB model. It builds a
NB model from the training data, classifies the training and test data,
and calculates the accuracy.
\begin{itemize}
\item The learner should treat all features as binary; that is,
the feature is considered present iff its value is nonzero.
\item The format is: {\tt build\_NB1.sh training\_data test\_data class\_prior\_delta cond\_prob\_delta model\_file sys\_output $>$ acc\_file }
\item training\_data and test\_data are the vector files in the text format
(cf. {\bf train.vectors.txt}).
\item class\_prior\_delta is the $\delta$ used in add-$\delta$ smoothing
when calculating the class prior $P(c)$;
cond\_prob\_delta is the $\delta$ used in add-$\delta$ smoothing
when calculating the
conditional probability $P(f \mid c)$.
\item model\_file stores the values of
$P(c)$ and $P(f \mid c)$ (cf. {\bf model1}). \\
Comment lines start with ``\%''.
The line for $P(c)$ has the format
``classname P(c) logprob'', where logprob is the base-10 log of $P(c)$. \\
The line for $P(f \mid c)$ has the format
``featname classname P(f$\mid$c) logprob'',
where logprob is the base-10 log of $P(f \mid c)$.
\item sys\_output is the classification result on the training and
test data (cf. {\bf sys1}). Each line has the following format:\\
{\tt instanceName true\_class\_label c1 p1 c2 p2 ...}, \\
where $p_i=P(c_i \mid x)=\frac{P(c_i, x)}{P(x)}$.
The $(c_i, p_i)$ pairs should be sorted according to
the value of $p_i$ in descending order.
\item acc\_file shows the confusion matrix and the accuracy for
the training and the test data (cf. {\bf acc1}).
\item As always, {\bf model1}, {\bf sys1}, and {\bf acc1}
are NOT gold standard.
These files were created with a much smaller training dataset.
\end{itemize}
Run build\_NB1.sh with {\bf train.vectors.txt} as the
training data, {\bf test.vectors.txt} as the test data,
and class\_prior\_delta set to 0:
\begin{itemize}
\item Fill out Table 1 with different values of cond\_prob\_delta.
\item Store the model\_file, sys\_output
and acc\_file for the second row (when cond\_prob\_delta is 0.5) under
q2/.
\end{itemize}
\begin{table}[h]
\centering
\caption{Results of your {\bf Bernoulli} NB model}
\label{table1}
\begin{tabular}{|r|l|l|} \hline
cond\_prob\_delta & Training accuracy & Test accuracy \\ \hline
0.1 & & \\ \hline
0.5 & & \\ \hline
1.0 & & \\ \hline
%% 2.0 & & \\ \hline
\end{tabular}
\end{table}
%%%%%%%%%%%%%%%%%%%%%%%%
\vspace{0.4 in}
\noindent {\bf Q3 (35 points):} Write a script, {\bf build\_NB2.sh},
that implements the multinomial NB model. Other than the modeling
(e.g., the features in the multinomial NB model are real-valued),
everything else (e.g., the input/output files) is the same as in Q2.
\begin{itemize}
\item Fill out Table 2.
\item Store the model\_file, sys\_output
and acc\_file for the second row (when cond\_prob\_delta is 0.5) under
q3/.
\end{itemize}
\begin{table}[h]
\centering
\caption{Results of your {\bf multinomial} NB model}
\label{table2}
\begin{tabular}{|r|l|l|} \hline
cond\_prob\_delta & Training accuracy & Test accuracy \\ \hline
0.1 & & \\ \hline
0.5 & & \\ \hline
1.0 & & \\ \hline
%% 2.0 & & \\ \hline
\end{tabular}
\end{table}
%%%%%%%%%%%%%%%%%%%%%%%%
\vspace{0.5 in}
\noindent {\bf Submission:} Submit the following to Canvas:
\begin{itemize}
\item Your note file {\it readme.(txt $\mid$ pdf)}
that includes Tables 1 and 2,
and any notes that you want the TA to read.
\item hw3.tar.gz that includes all the files specified in
dropbox/19-20/572/hw3/submit-file-list, plus any source code
(and binary code) used by the shell scripts.
\item Make sure that you run {\bf check\_hw3.sh} before
submitting your hw3.tar.gz.
\end{itemize}
\end{document}