https://github.com/cran/RecordLinkage
Tip revision: b32452149857b15849e9c82fb81e812df6e921fc authored by Murat Sariyar on 08 November 2022, 13:10:15 UTC
version 0.4-12.4
version 0.4-12.4
Tip revision: b324521
Supervised.rnw
%\VignetteIndexEntry{Supervised Classification}
%\VignetteEngine{knitr::knitr}
<<echo=FALSE,results='hide'>>=
# Snapshot the session options BEFORE changing any of them, so that the
# closing chunk's options(backup_options) restores the original state
# (including the original console width).
backup_options <- options()
# Suppress messages and warnings in all subsequent chunks. The knitr chunk
# option is spelled `warning` (singular); `warnings` is not a chunk option.
knitr::opts_chunk$set(message = FALSE, warning = FALSE)
# Limit printed output to 60 columns for the rest of the vignette.
options(width = 60)
@
\documentclass[a4paper]{article}
\usepackage[ansinew]{inputenc}
\begin{document}
<<include=FALSE>>=
# Attach knitr so that opts_chunk can be used unqualified, then set the
# `concordance` entry in the default chunk options.
library(knitr)
opts_chunk$set(concordance = TRUE)
@
\title{Example Session for Supervised Classification}
\author{Andreas Borg, Murat Sariyar}
\maketitle
This document shows an example session for using supervised classification
in the package \textit{RecordLinkage} for deduplication of a single data set.
Conducting linkage of two data sets differs only in the step of generating
record pairs.
See also the vignette on Fellegi-Sunter deduplication for some general
information on using the package.
\section{Generating comparison patterns}
<<results='hide',echo=FALSE>>=
library(RecordLinkage)
@
In this session, a training set with 500 matches and 500
non-matches is generated from the included data set \texttt{RLdata10000}.
Record pairs from the set \texttt{RLdata500} are used to
evaluate the calibrated classifiers.
<<>>=
# Load the two example data sets used in this session.
data(RLdata500)
data(RLdata10000)

# Training pairs: built from RLdata10000, requesting 500 matches and
# 500 non-matches.
train_pairs <- compare.dedup(
  RLdata10000,
  identity = identity.RLdata10000,
  n_match = 500,
  n_non_match = 500
)

# Evaluation pairs: built from RLdata500.
eval_pairs <- compare.dedup(RLdata500, identity = identity.RLdata500)
@
\section{Training}
\texttt{trainSupv} handles calibration of supervised classifiers, which are
selected through the argument \texttt{method}. In the
following, a single decision tree (rpart), a bootstrap aggregation of decision trees
(bagging) and a support vector machine (svm) are calibrated.
<<>>=
# Calibrate one classifier per method, all on the same training pairs.
model_rpart <- trainSupv(train_pairs, method = "rpart")
model_bagging <- trainSupv(train_pairs, method = "bagging")
model_svm <- trainSupv(train_pairs, method = "svm")
@
\section{Classification}
\texttt{classifySupv} handles classification for all supervised classifiers,
taking as arguments the structure returned by \texttt{trainSupv}, which contains
the classification model, and the set of record pairs to classify.
<<>>=
# Apply each calibrated model to the evaluation pairs.
result_rpart <- classifySupv(model_rpart, eval_pairs)
result_bagging <- classifySupv(model_bagging, eval_pairs)
result_svm <- classifySupv(model_svm, eval_pairs)
@
\section{Results}
\subsection{Rpart}
<<results='asis',echo=FALSE>>=
# Summarise the rpart classification result. The chunk uses results='asis',
# so whatever texSummary prints is inserted into the document unmodified
# (presumably LaTeX markup -- confirm against the texSummary documentation).
texSummary(result_rpart)
@
\subsection{Bagging}
<<results='asis',echo=FALSE>>=
# Summarise the bagging classification result. The chunk uses results='asis',
# so whatever texSummary prints is inserted into the document unmodified
# (presumably LaTeX markup -- confirm against the texSummary documentation).
texSummary(result_bagging)
@
\subsection{SVM}
<<results='asis',echo=FALSE>>=
# Summarise the SVM classification result. The chunk uses results='asis',
# so whatever texSummary prints is inserted into the document unmodified
# (presumably LaTeX markup -- confirm against the texSummary documentation).
texSummary(result_svm)
@
<<echo=FALSE,results='hide'>>=
# Re-apply the option values captured in backup_options by the first chunk,
# undoing option changes made while building this vignette.
options(backup_options)
@
\end{document}