# author: Matthias C. M. Troffaes # date: 25 July 2018 # license: GPLv3 # helper functions for type checking ############################################################################### .stopifnotdataframe = function(x) { if (!is.data.frame(x)) stop("expected a data frame") } .stopifnottable = function(x) { if (!is.table(x)) stop("expected a table") } .stopifnotlistoftables = function(xs) { if (!is.list(xs)) stop("expected a list of tables") lapply(xs, function(x) { .stopifnottable(x) }) NULL } .stopifnotnumericvalue = function(x) { if (!is.numeric(x)) stop("expected a single numeric value") if (length(x) != 1) stop("expected a single numeric value") } .stopifnotnumericvector = function(x) { if (!is.numeric(x)) stop("expected a numeric vector") } # functions for cross fold validation # https://gist.github.com/mcmtroffaes/709908 ############################################################################### kfcv.sizes = function(n, k=10) { # generate sample sizes for k-fold cross validation on a data set of # size n # usage: # # kfcv.sizes(n, k=...) # sizes = c() for (i in 1:k) { first = 1 + (((i - 1) * n) %/% k) last = ((i * n) %/% k) sizes = append(sizes, last - first + 1) } sizes } kfcv.testing = function(n, k=10) { # generate testing sample indices for k-fold cross validation on a # data set of size n # usage: # # kfcv.testing(n, k=...) # indices = list() sizes = kfcv.sizes(n, k=k) values = 1:n for (i in 1:k) { # take a random sample of given size s = sample(values, sizes[i]) # append random sample to list of indices indices[[i]] = s # remove sample from values values = setdiff(values, s) } indices } kfcv.classifier = function(data, attribs, class, classifier, k=10) { # run k-fold cross validation with an arbitrary classifier # usage: # # kfcv.classifier(data, class, classifier, k=...) # # where data is the data frame (each column is an attribute, and # each row is an observation), class is the column index for the # attribute to be predicted # classifier$trainer is a function which takes a training set, # attribute column indices, and a class column index; it returns a # model (a data structure) that can be used by the tester function # described next # classifier$tester is a function which takes a trained model and a # single row from a test set; it returns list of numerical test # results (e.g. whether the classifier predicted correctly, utility # for misclassification, ...) colMeans( do.call( rbind.data.frame, lapply( kfcv.testing(dim(data)[1]), function(testingindices) { model = classifier$trainer(data[-testingindices,], attribs, class) do.call( rbind.data.frame, lapply( testingindices, function(rowid) { classifier$tester(model, data[rowid,]) } )) }) ), na.rm=TRUE) } # compose classifiers from a list of classifiers ############################################################################### classifier.composed = function(classifiers) { list( trainer=function(train, attribs, class) { # the model is simply a list of models, one for each classifier lapply(classifiers, function(cls) cls$trainer(train, attribs, class)) }, tester=function(model, testrow) { # apply each tester to each model do.call( c, lapply( 1:length(model), function(i) (classifiers[[i]]$tester)(model[[i]], testrow))) } ) } # trainer and tester functions for the naive Bayes classifier ############################################################################### # calculate empirical probability mass function # rows = the data frame; each column is a variable, each row is a joint # observation # colid = column of the variable of which to calculate probability mass function # s1, s2, s3 = values to change the counts (see code) .helper.empirical.prob.table = function(s1, s2, s3) { function(rows, colid) { .stopifnotdataframe(rows) counts = table(rows[colid]) prob = (counts + s1) / (sum(counts + s2) + s3) stopifnot(all(levels(factor(rows[,colid])) %in% names(prob))) prob } } .empirical.prob.table = .helper.empirical.prob.table(0, 0, 0) .laplace.empirical.prob.table = function(alpha=1) .helper.empirical.prob.table(alpha, alpha, 0) .lower.empirical.prob.table = function(s=2) .helper.empirical.prob.table(0, 0, s) .upper.empirical.prob.table = function(s=2) .helper.empirical.prob.table(s, 0, s) # calculate empirical conditional probability mass function # rows = the data frame; each column is a variable, each row is a joint # observation # colid1 = column with values of the conditioning variable # colid2 = column with values of the non-conditioning random variable # returns a table where each row is a probability mass function .helper.empirical.conditional.prob.table = function(s1, s2, s3) { function(rows, colid1, colid2) { .stopifnotdataframe(rows) counts = table(rows[c(colid1, colid2)]) (counts + s1) / (apply((counts + s2), 1, sum) + s3) } } .empirical.conditional.prob.table = .helper.empirical.conditional.prob.table(0, 0, 0) .laplace.empirical.conditional.prob.table = function(alpha=1) .helper.empirical.conditional.prob.table(alpha, alpha, 0) .lower.empirical.conditional.prob.table = function(s=2) .helper.empirical.conditional.prob.table(0, 0, s) .upper.empirical.conditional.prob.table = function(s=2) .helper.empirical.conditional.prob.table(s, 0, s) # calculate joint probability p(c,a) = p(c) * p(a1|c) * ... * p(ak|c) .classifier.naive.joint.prob = function(c.prob.table, a.prob.tables, a.colids, c.level, testrow) { # check input types and values .stopifnottable(c.prob.table) .stopifnotlistoftables(a.prob.tables) .stopifnotnumericvector(a.colids) .stopifnotdataframe(testrow) # debug: we should have one p(c|a) per attribute stopifnot(length(a.colids) == length(a.prob.tables)) # calculate p(a1|c), p(a2|c), ..., p(ak|c) p.acs = sapply( 1:length(a.colids), function(i) { colid = a.colids[i] a.level = testrow[1, colid] prob.table.a.given.c = a.prob.tables[[i]] prob.table.a.given.c[c.level, a.level] } ) # calculate p(c) p.c = c.prob.table[c.level] # joint probability under naive assumption # p(c,a)=p(c) * p(a1|c) * ... * p(ak|c) p.c * prod(p.acs) } # calculate all class probabilities from a test row .classifier.naive.joint.probs = function(c.prob.table, a.prob.tables, a.colids, testrow) { .stopifnottable(c.prob.table) .stopifnotlistoftables(a.prob.tables) .stopifnotnumericvector(a.colids) .stopifnotdataframe(testrow) # calculate p(c,a) for each class level c # (the a_i are stored in testrow, the class levels are c.levels) lapply( names(c.prob.table), function(c.level) { .classifier.naive.joint.prob(c.prob.table, a.prob.tables, a.colids, c.level, testrow) } ) } # predict the class from a test row .classifier.naive.predict = function(c.prob.table, a.prob.tables, a.colids, testrow) { .stopifnottable(c.prob.table) .stopifnotlistoftables(a.prob.tables) .stopifnotnumericvector(a.colids) .stopifnotdataframe(testrow) # calculate p(c,a) for each class level c # (the a_i are stored in testrow, the class levels are c.levels) c.levels = names(c.prob.table) p.cas = .classifier.naive.joint.probs(c.prob.table, a.prob.tables, a.colids, testrow) c.levels[which.max(p.cas)] } classifier.naive2 = function(alpha=0) { list( trainer=function(train, attribs, class) { list( attribs=attribs, class=class, c.prob.table=.laplace.empirical.prob.table(alpha)(train, class), a.prob.tables=lapply( attribs, function(attrib) { .laplace.empirical.conditional.prob.table(alpha)(train, class, attrib) }) ) }, tester=function(model, testrow) { predictedclass = .classifier.naive.predict( model$c.prob.table, model$a.prob.tables, model$attribs, testrow) list(acc=(predictedclass == testrow[1, model$class])) } ) } # a simple credal naive Bayes classifier based on interval dominance ############################################################################### classifier.credal = function(s=2) { list( trainer=function(train, attribs, class) { list( attribs=attribs, class=class, c.lprob.table=.lower.empirical.prob.table(s)(train, class), c.uprob.table=.upper.empirical.prob.table(s)(train, class), a.lprob.tables=lapply( attribs, function(attrib) { .lower.empirical.conditional.prob.table(s)(train, class, attrib) }), a.uprob.tables=lapply( attribs, function(attrib) { .upper.empirical.conditional.prob.table(s)(train, class, attrib) }) ) }, tester=function(model, testrow) { lprobs = unlist(.classifier.naive.joint.probs(model$c.lprob.table, model$a.lprob.tables, model$attribs, testrow)) uprobs = unlist(.classifier.naive.joint.probs(model$c.uprob.table, model$a.uprob.tables, model$attribs, testrow)) c.levels = names(model$c.lprob.table) result = c.levels[uprobs >= max(lprobs)] accuracy = (testrow[1, model$class] %in% result) list( acc=accuracy, deter=(length(result) == 1), singleacc=if (length(result) == 1) accuracy else NA, setsize=if (length(result) != 1) length(result) else NA, setacc=if (length(result) != 1) accuracy else NA) } ) } # reading the data ############################################################################### getdata = function() { mydata = read.csv( text=" 5,67,3,5,3,1 4,43,1,1,?,1 5,58,4,5,3,1 4,28,1,1,3,0 5,74,1,5,?,1 4,65,1,?,3,0 4,70,?,?,3,0 5,42,1,?,3,0 5,57,1,5,3,1 5,60,?,5,1,1 5,76,1,4,3,1 3,42,2,1,3,1 4,64,1,?,3,0 4,36,3,1,2,0 4,60,2,1,2,0 4,54,1,1,3,0 3,52,3,4,3,0 4,59,2,1,3,1 4,54,1,1,3,1 4,40,1,?,?,0 ?,66,?,?,1,1 5,56,4,3,1,1 4,43,1,?,?,0 5,42,4,4,3,1 4,59,2,4,3,1 5,75,4,5,3,1 2,66,1,1,?,0 5,63,3,?,3,0 5,45,4,5,3,1 5,55,4,4,3,0 4,46,1,5,2,0 5,54,4,4,3,1 5,57,4,4,3,1 4,39,1,1,2,0 4,81,1,1,3,0 4,77,3,?,?,0 4,60,2,1,3,0 5,67,3,4,2,1 4,48,4,5,?,1 4,55,3,4,2,0 4,59,2,1,?,0 4,78,1,1,1,0 4,50,1,1,3,0 4,61,2,1,?,0 5,62,3,5,2,1 5,44,2,4,?,1 5,64,4,5,3,1 4,23,1,1,?,0 2,42,?,?,4,0 5,67,4,5,3,1 4,74,2,1,2,0 5,80,3,5,3,1 4,23,1,1,?,0 4,63,2,1,?,0 4,53,?,5,3,1 4,43,3,4,?,0 4,49,2,1,1,0 5,51,2,4,?,0 4,45,2,1,?,0 5,59,2,?,?,1 5,52,4,3,3,1 5,60,4,3,3,1 4,57,2,5,3,0 3,57,2,1,?,0 5,74,4,4,3,1 4,25,2,1,?,0 4,49,1,1,3,0 5,72,4,3,?,1 4,45,2,1,3,0 4,64,2,1,3,0 4,73,2,1,2,0 5,68,4,3,3,1 5,52,4,5,3,0 5,66,4,4,3,1 5,70,?,4,?,1 4,25,1,1,3,0 5,74,1,1,2,1 4,64,1,1,3,0 5,60,4,3,2,1 5,67,2,4,1,0 4,67,4,5,3,0 5,44,4,4,2,1 3,68,1,1,3,1 4,57,?,4,1,0 5,51,4,?,?,1 4,33,1,?,?,0 5,58,4,4,3,1 5,36,1,?,?,0 4,63,1,1,?,0 5,62,1,5,3,1 4,73,3,4,3,1 4,80,4,4,3,1 4,67,1,1,?,0 5,59,2,1,3,1 5,60,1,?,3,0 5,54,4,4,3,1 4,40,1,1,?,0 4,47,2,1,?,0 5,62,4,4,3,0 4,33,2,1,3,0 5,59,2,?,?,0 4,65,2,?,?,0 4,58,4,4,?,0 4,29,2,?,?,0 4,58,1,1,?,0 4,54,1,1,?,0 4,44,1,1,?,1 3,34,2,1,?,0 4,57,1,1,3,0 5,33,4,4,?,1 4,45,4,4,3,0 5,71,4,4,3,1 5,59,4,4,2,0 4,56,2,1,?,0 4,40,3,4,?,0 4,56,1,1,3,0 4,45,2,1,?,0 4,57,2,1,2,0 5,55,3,4,3,1 5,84,4,5,3,0 5,51,4,4,3,1 4,43,1,1,?,0 4,24,2,1,2,0 4,66,1,1,3,0 5,33,4,4,3,0 4,59,4,3,2,0 4,76,2,3,?,0 4,40,1,1,?,0 4,52,?,4,?,0 5,40,4,5,3,1 5,67,4,4,3,1 5,75,4,3,3,1 5,86,4,4,3,0 4,60,2,?,?,0 5,66,4,4,3,1 5,46,4,5,3,1 4,59,4,4,3,1 5,65,4,4,3,1 4,53,1,1,3,0 5,67,3,5,3,1 5,80,4,5,3,1 4,55,2,1,3,0 4,48,1,1,?,0 4,47,1,1,2,0 4,50,2,1,?,0 5,62,4,5,3,1 5,63,4,4,3,1 4,63,4,?,3,1 4,71,4,4,3,1 4,41,1,1,3,0 5,57,4,4,4,1 5,71,4,4,4,1 4,66,1,1,3,0 4,47,2,4,2,0 3,34,4,4,3,0 4,59,3,4,3,0 5,55,2,?,?,1 4,51,?,?,3,0 4,62,2,1,?,0 4,58,4,?,3,1 5,67,4,4,3,1 4,41,2,1,3,0 4,23,3,1,3,0 4,53,?,4,3,0 4,42,2,1,3,0 5,87,4,5,3,1 4,68,1,1,3,1 4,64,1,1,3,0 5,54,3,5,3,1 5,86,4,5,3,1 4,21,2,1,3,0 4,39,1,1,?,0 4,53,4,4,3,0 4,44,4,4,3,0 4,54,1,1,3,0 5,63,4,5,3,1 4,62,2,1,?,0 4,45,2,1,2,0 5,71,4,5,3,0 5,49,4,4,3,1 4,49,4,4,3,0 5,66,4,4,4,0 4,19,1,1,3,0 4,35,1,1,2,0 4,71,3,3,?,1 5,74,4,5,3,1 5,37,4,4,3,1 4,67,1,?,3,0 5,81,3,4,3,1 5,59,4,4,3,1 4,34,1,1,3,0 5,79,4,3,3,1 5,60,3,1,3,0 4,41,1,1,3,1 4,50,1,1,3,0 5,85,4,4,3,1 4,46,1,1,3,0 5,66,4,4,3,1 4,73,3,1,2,0 4,55,1,1,3,0 4,49,2,1,3,0 3,49,4,4,3,0 4,51,4,5,3,1 2,48,4,4,3,0 4,58,4,5,3,0 5,72,4,5,3,1 4,46,2,3,3,0 4,43,4,3,3,1 ?,52,4,4,3,0 4,66,2,1,?,0 4,46,1,1,1,0 4,69,3,1,3,0 2,59,1,1,?,1 5,43,2,1,3,1 5,76,4,5,3,1 4,46,1,1,3,0 4,59,2,4,3,0 4,57,1,1,3,0 5,43,4,5,?,0 3,45,2,1,3,0 3,43,2,1,3,0 4,45,2,1,3,0 5,57,4,5,3,1 5,79,4,4,3,1 5,54,2,1,3,1 4,40,3,4,3,0 5,63,4,4,3,1 2,55,1,?,1,0 4,52,2,1,3,0 4,38,1,1,3,0 3,72,4,3,3,0 5,80,4,3,3,1 5,76,4,3,3,1 4,62,3,1,3,0 5,64,4,5,3,1 5,42,4,5,3,0 3,60,?,3,1,0 4,64,4,5,3,0 4,63,4,4,3,1 4,24,2,1,2,0 5,72,4,4,3,1 4,63,2,1,3,0 4,46,1,1,3,0 3,33,1,1,3,0 5,76,4,4,3,1 4,36,2,3,3,0 4,40,2,1,3,0 5,58,1,5,3,1 4,43,2,1,3,0 3,42,1,1,3,0 4,32,1,1,3,0 5,57,4,4,2,1 4,37,1,1,3,0 4,70,4,4,3,1 5,56,4,2,3,1 3,76,?,3,2,0 5,73,4,4,3,1 5,77,4,5,3,1 5,67,4,4,1,1 5,71,4,3,3,1 5,65,4,4,3,1 4,43,1,1,3,0 4,40,2,1,?,0 4,49,2,1,3,0 5,76,4,2,3,1 4,55,4,4,3,0 5,72,4,5,3,1 3,53,4,3,3,0 5,75,4,4,3,1 5,61,4,5,3,1 5,67,4,4,3,1 5,55,4,2,3,1 5,66,4,4,3,1 2,76,1,1,2,0 4,57,4,4,3,1 5,71,3,1,3,0 5,70,4,5,3,1 4,35,4,2,?,0 5,79,1,?,3,1 4,63,2,1,3,0 5,40,1,4,3,1 4,41,1,1,3,0 4,47,2,1,2,0 4,68,1,1,3,1 4,64,4,3,3,1 4,65,4,4,?,1 4,73,4,3,3,0 4,39,4,3,3,0 5,55,4,5,4,1 5,53,3,4,4,0 5,66,4,4,3,1 4,43,3,1,2,0 5,44,4,5,3,1 4,77,4,4,3,1 4,62,2,4,3,0 5,80,4,4,3,1 4,33,4,4,3,0 4,50,4,5,3,1 4,71,1,?,3,0 5,46,4,4,3,1 5,49,4,5,3,1 4,53,1,1,3,0 3,46,2,1,2,0 4,57,1,1,3,0 4,54,3,1,3,0 4,54,1,?,?,0 2,49,2,1,2,0 4,47,3,1,3,0 4,40,1,1,3,0 4,45,1,1,3,0 4,50,4,5,3,1 5,54,4,4,3,1 4,67,4,1,3,1 4,77,4,4,3,1 4,66,4,3,3,0 4,71,2,?,3,1 4,36,2,3,3,0 4,69,4,4,3,0 4,48,1,1,3,0 4,64,4,4,3,1 4,71,4,2,3,1 5,60,4,3,3,1 4,24,1,1,3,0 5,34,4,5,2,1 4,79,1,1,2,0 4,45,1,1,3,0 4,37,2,1,2,0 4,42,1,1,2,0 4,72,4,4,3,1 5,60,4,5,3,1 5,85,3,5,3,1 4,51,1,1,3,0 5,54,4,5,3,1 5,55,4,3,3,1 4,64,4,4,3,0 5,67,4,5,3,1 5,75,4,3,3,1 5,87,4,4,3,1 4,46,4,4,3,1 4,59,2,1,?,0 55,46,4,3,3,1 5,61,1,1,3,1 4,44,1,4,3,0 4,32,1,1,3,0 4,62,1,1,3,0 5,59,4,5,3,1 4,61,4,1,3,0 5,78,4,4,3,1 5,42,4,5,3,0 4,45,1,2,3,0 5,34,2,1,3,1 5,39,4,3,?,1 4,27,3,1,3,0 4,43,1,1,3,0 5,83,4,4,3,1 4,36,2,1,3,0 4,37,2,1,3,0 4,56,3,1,3,1 5,55,4,4,3,1 5,46,3,?,3,0 4,88,4,4,3,1 5,71,4,4,3,1 4,41,2,1,3,0 5,49,4,4,3,1 3,51,1,1,4,0 4,39,1,3,3,0 4,46,2,1,3,0 5,52,4,4,3,1 5,58,4,4,3,1 4,67,4,5,3,1 5,80,4,4,3,1 3,46,1,?,?,0 3,43,1,?,?,0 4,45,1,1,3,0 5,68,4,4,3,1 4,54,4,4,?,1 4,44,2,3,3,0 5,74,4,3,3,1 5,55,4,5,3,0 4,49,4,4,3,1 4,49,1,1,3,0 5,50,4,3,3,1 5,52,3,5,3,1 4,45,1,1,3,0 4,66,1,1,3,0 4,68,4,4,3,1 4,72,2,1,3,0 5,64,?,?,3,0 2,49,?,3,3,0 3,44,?,4,3,0 5,74,4,4,3,1 5,58,4,4,3,1 4,77,2,3,3,0 4,49,3,1,3,0 4,34,?,?,4,0 5,60,4,3,3,1 5,69,4,3,3,1 4,53,2,1,3,0 3,46,3,4,3,0 5,74,4,4,3,1 4,58,1,1,3,0 5,68,4,4,3,1 5,46,4,3,3,0 5,61,2,4,3,1 5,70,4,3,3,1 5,37,4,4,3,1 3,65,4,5,3,1 4,67,4,4,3,0 5,69,3,4,3,0 5,76,4,4,3,1 4,65,4,3,3,0 5,72,4,2,3,1 4,62,4,2,3,0 5,42,4,4,3,1 5,66,4,3,3,1 5,48,4,4,3,1 4,35,1,1,3,0 5,60,4,4,3,1 5,67,4,2,3,1 5,78,4,4,3,1 4,66,1,1,3,1 4,26,1,1,?,0 4,48,1,1,3,0 4,31,1,1,3,0 5,43,4,3,3,1 5,72,2,4,3,0 5,66,1,1,3,1 4,56,4,4,3,0 5,58,4,5,3,1 5,33,2,4,3,1 4,37,1,1,3,0 5,36,4,3,3,1 4,39,2,3,3,0 4,39,4,4,3,1 5,83,4,4,3,1 4,68,4,5,3,1 5,63,3,4,3,1 5,78,4,4,3,1 4,38,2,3,3,0 5,46,4,3,3,1 5,60,4,4,3,1 5,56,2,3,3,1 4,33,1,1,3,0 4,?,4,5,3,1 4,69,1,5,3,1 5,66,1,4,3,1 4,72,1,3,3,0 4,29,1,1,3,0 5,54,4,5,3,1 5,80,4,4,3,1 5,68,4,3,3,1 4,35,2,1,3,0 4,57,3,?,3,0 5,?,4,4,3,1 4,50,1,1,3,0 4,32,4,3,3,0 0,69,4,5,3,1 4,71,4,5,3,1 5,87,4,5,3,1 3,40,2,?,3,0 4,31,1,1,?,0 4,64,1,1,3,0 5,55,4,5,3,1 4,18,1,1,3,0 3,50,2,1,?,0 4,53,1,1,3,0 5,84,4,5,3,1 5,80,4,3,3,1 4,32,1,1,3,0 5,77,3,4,3,1 4,38,1,1,3,0 5,54,4,5,3,1 4,63,1,1,3,0 4,61,1,1,3,0 4,52,1,1,3,0 4,36,1,1,3,0 4,41,?,?,3,0 4,59,1,1,3,0 5,51,4,4,2,1 4,36,1,1,3,0 5,40,4,3,3,1 4,49,1,1,3,0 4,37,2,3,3,0 4,46,1,1,3,0 4,63,1,1,3,0 4,28,2,1,3,0 4,47,2,1,3,0 4,42,2,1,3,1 5,44,4,5,3,1 4,49,4,4,3,0 5,47,4,5,3,1 5,52,4,5,3,1 4,53,1,1,3,1 5,83,3,3,3,1 4,50,4,4,?,1 5,63,4,4,3,1 4,82,?,5,3,1 4,54,1,1,3,0 4,50,4,4,3,0 5,80,4,5,3,1 5,45,2,4,3,0 5,59,4,4,?,1 4,28,2,1,3,0 4,31,1,1,3,0 4,41,2,1,3,0 4,21,3,1,3,0 5,44,3,4,3,1 5,49,4,4,3,1 5,71,4,5,3,1 5,75,4,5,3,1 4,38,2,1,3,0 4,60,1,3,3,0 5,87,4,5,3,1 4,70,4,4,3,1 5,55,4,5,3,1 3,21,1,1,3,0 4,50,1,1,3,0 5,76,4,5,3,1 4,23,1,1,3,0 3,68,?,?,3,0 4,62,4,?,3,1 5,65,1,?,3,1 5,73,4,5,3,1 4,38,2,3,3,0 2,57,1,1,3,0 5,65,4,5,3,1 5,67,2,4,3,1 5,61,2,4,3,1 5,56,4,4,3,0 5,71,2,4,3,1 4,49,2,2,3,0 4,55,?,?,3,0 4,44,2,1,3,0 0,58,4,4,3,0 4,27,2,1,3,0 5,73,4,5,3,1 4,34,2,1,3,0 5,63,?,4,3,1 4,50,2,1,3,1 4,62,2,1,3,0 3,21,3,1,3,0 4,49,2,?,3,0 4,36,3,1,3,0 4,45,2,1,3,1 5,67,4,5,3,1 4,21,1,1,3,0 4,57,2,1,3,0 5,66,4,5,3,1 4,71,4,4,3,1 5,69,3,4,3,1 6,80,4,5,3,1 3,27,2,1,3,0 4,38,2,1,3,0 4,23,2,1,3,0 5,70,?,5,3,1 4,46,4,3,3,0 4,61,2,3,3,0 5,65,4,5,3,1 4,60,4,3,3,0 5,83,4,5,3,1 5,40,4,4,3,1 2,59,?,4,3,0 4,53,3,4,3,0 4,76,4,4,3,0 5,79,1,4,3,1 5,38,2,4,3,1 4,61,3,4,3,0 4,56,2,1,3,0 4,44,2,1,3,0 4,64,3,4,?,1 4,66,3,3,3,0 4,50,3,3,3,0 4,46,1,1,3,0 4,39,1,1,3,0 4,60,3,?,?,0 5,55,4,5,3,1 4,40,2,1,3,0 4,26,1,1,3,0 5,84,3,2,3,1 4,41,2,2,3,0 4,63,1,1,3,0 2,65,?,1,2,0 4,49,1,1,3,0 4,56,2,2,3,1 5,65,4,4,3,0 4,54,1,1,3,0 4,36,1,1,3,0 5,49,4,4,3,0 4,59,4,4,3,1 5,75,4,4,3,1 5,59,4,2,3,0 5,59,4,4,3,1 4,28,4,4,3,1 5,53,4,5,3,0 5,57,4,4,3,0 5,77,4,3,4,0 5,85,4,3,3,1 4,59,4,4,3,0 5,59,1,5,3,1 4,65,3,3,3,1 4,54,2,1,3,0 5,46,4,5,3,1 4,63,4,4,3,1 4,53,1,1,3,1 4,56,1,1,3,0 5,66,4,4,3,1 5,66,4,5,3,1 4,55,1,1,3,0 4,44,1,1,3,0 5,86,3,4,3,1 5,47,4,5,3,1 5,59,4,5,3,1 5,66,4,5,3,0 5,61,4,3,3,1 3,46,?,5,?,1 4,69,1,1,3,0 5,93,1,5,3,1 4,39,1,3,3,0 5,44,4,5,3,1 4,45,2,2,3,0 4,51,3,4,3,0 4,56,2,4,3,0 4,66,4,4,3,0 5,61,4,5,3,1 4,64,3,3,3,1 5,57,2,4,3,0 5,79,4,4,3,1 4,57,2,1,?,0 4,44,4,1,1,0 4,31,2,1,3,0 4,63,4,4,3,0 4,64,1,1,3,0 5,47,4,5,3,0 5,68,4,5,3,1 4,30,1,1,3,0 5,43,4,5,3,1 4,56,1,1,3,0 4,46,2,1,3,0 4,67,2,1,3,0 5,52,4,5,3,1 4,67,4,4,3,1 4,47,2,1,3,0 5,58,4,5,3,1 4,28,2,1,3,0 4,43,1,1,3,0 4,57,2,4,3,0 5,68,4,5,3,1 4,64,2,4,3,0 4,64,2,4,3,0 5,62,4,4,3,1 4,38,4,1,3,0 5,68,4,4,3,1 4,41,2,1,3,0 4,35,2,1,3,1 4,68,2,1,3,0 5,55,4,4,3,1 5,67,4,4,3,1 4,51,4,3,3,0 2,40,1,1,3,0 5,73,4,4,3,1 4,58,?,4,3,1 4,51,?,4,3,0 3,50,?,?,3,1 5,59,4,3,3,1 6,60,3,5,3,1 4,27,2,1,?,0 5,54,4,3,3,0 4,56,1,1,3,0 5,53,4,5,3,1 4,54,2,4,3,0 5,79,1,4,3,1 5,67,4,3,3,1 5,64,3,3,3,1 4,70,1,2,3,1 5,55,4,3,3,1 5,65,3,3,3,1 5,45,4,2,3,1 4,57,4,4,?,1 5,49,1,1,3,1 4,24,2,1,3,0 4,52,1,1,3,0 4,50,2,1,3,0 4,35,1,1,3,0 5,?,3,3,3,1 5,64,4,3,3,1 5,40,4,1,1,1 5,66,4,4,3,1 4,64,4,4,3,1 5,52,4,3,3,1 5,43,1,4,3,1 4,56,4,4,3,0 4,72,3,?,3,0 6,51,4,4,3,1 4,79,4,4,3,1 4,22,2,1,3,0 4,73,2,1,3,0 4,53,3,4,3,0 4,59,2,1,3,1 4,46,4,4,2,0 5,66,4,4,3,1 4,50,4,3,3,1 4,58,1,1,3,1 4,55,1,1,3,0 4,62,2,4,3,1 4,60,1,1,3,0 5,57,4,3,3,1 4,57,1,1,3,0 6,41,2,1,3,0 4,71,2,1,3,1 4,32,2,1,3,0 4,57,2,1,3,0 4,19,1,1,3,0 4,62,2,4,3,1 5,67,4,5,3,1 4,50,4,5,3,0 4,65,2,3,2,0 4,40,2,4,2,0 6,71,4,4,3,1 6,68,4,3,3,1 4,68,1,1,3,0 4,29,1,1,3,0 4,53,2,1,3,0 5,66,4,4,3,1 4,60,3,?,4,0 5,76,4,4,3,1 4,58,2,1,2,0 5,96,3,4,3,1 5,70,4,4,3,1 4,34,2,1,3,0 4,59,2,1,3,0 4,45,3,1,3,1 5,65,4,4,3,1 4,59,1,1,3,0 4,21,2,1,3,0 3,43,2,1,3,0 4,53,1,1,3,0 4,65,2,1,3,0 4,64,2,4,3,1 4,53,4,4,3,0 4,51,1,1,3,0 4,59,2,4,3,0 4,56,2,1,3,0 4,60,2,1,3,0 4,22,1,1,3,0 4,25,2,1,3,0 6,76,3,?,3,0 5,69,4,4,3,1 4,58,2,1,3,0 5,62,4,3,3,1 4,56,4,4,3,0 4,64,1,1,3,0 4,32,2,1,3,0 5,48,?,4,?,1 5,59,4,4,2,1 4,52,1,1,3,0 4,63,4,4,3,0 5,67,4,4,3,1 5,61,4,4,3,1 5,59,4,5,3,1 5,52,4,3,3,1 4,35,4,4,3,0 5,77,3,3,3,1 5,71,4,3,3,1 5,63,4,3,3,1 4,38,2,1,2,0 5,72,4,3,3,1 4,76,4,3,3,1 4,53,3,3,3,0 4,67,4,5,3,0 5,69,2,4,3,1 4,54,1,1,3,0 2,35,2,1,2,0 5,68,4,3,3,1 4,68,4,4,3,0 4,67,2,4,3,1 3,39,1,1,3,0 4,44,2,1,3,0 4,33,1,1,3,0 4,60,?,4,3,0 4,58,1,1,3,0 4,31,1,1,3,0 3,23,1,1,3,0 5,56,4,5,3,1 4,69,2,1,3,1 6,63,1,1,3,0 4,65,1,1,3,1 4,44,2,1,2,0 4,62,3,3,3,1 4,67,4,4,3,1 4,56,2,1,3,0 4,52,3,4,3,0 4,43,1,1,3,1 4,41,4,3,2,1 4,42,3,4,2,0 3,46,1,1,3,0 5,55,4,4,3,1 5,58,4,4,2,1 5,87,4,4,3,1 4,66,2,1,3,0 0,72,4,3,3,1 5,60,4,3,3,1 5,83,4,4,2,1 4,31,2,1,3,0 4,53,2,1,3,0 4,64,2,3,3,0 5,31,4,4,2,1 5,62,4,4,2,1 4,56,2,1,3,0 5,58,4,4,3,1 4,67,1,4,3,0 5,75,4,5,3,1 5,65,3,4,3,1 5,74,3,2,3,1 4,59,2,1,3,0 4,57,4,4,4,1 4,76,3,2,3,0 4,63,1,4,3,0 4,44,1,1,3,0 4,42,3,1,2,0 4,35,3,?,2,0 5,65,4,3,3,1 4,70,2,1,3,0 4,48,1,1,3,0 4,74,1,1,1,1 6,40,?,3,4,1 4,63,1,1,3,0 5,60,4,4,3,1 5,86,4,3,3,1 4,27,1,1,3,0 4,71,4,5,2,1 5,85,4,4,3,1 4,51,3,3,3,0 6,72,4,3,3,1 5,52,4,4,3,1 4,66,2,1,3,0 5,71,4,5,3,1 4,42,2,1,3,0 4,64,4,4,2,1 4,41,2,2,3,0 4,50,2,1,3,0 4,30,1,1,3,0 4,67,1,1,3,0 5,62,4,4,3,1 4,46,2,1,2,0 4,35,1,1,3,0 4,53,1,1,2,0 4,59,2,1,3,0 4,19,3,1,3,0 5,86,2,1,3,1 4,72,2,1,3,0 4,37,2,1,2,0 4,46,3,1,3,1 4,45,1,1,3,0 4,48,4,5,3,0 4,58,4,4,3,1 4,42,1,1,3,0 4,56,2,4,3,1 4,47,2,1,3,0 4,49,4,4,3,1 5,76,2,5,3,1 5,62,4,5,3,1 5,64,4,4,3,1 5,53,4,3,3,1 4,70,4,2,2,1 5,55,4,4,3,1 4,34,4,4,3,0 5,76,4,4,3,1 4,39,1,1,3,0 2,23,1,1,3,0 4,19,1,1,3,0 5,65,4,5,3,1 4,57,2,1,3,0 5,41,4,4,3,1 4,36,4,5,3,1 4,62,3,3,3,0 4,69,2,1,3,0 4,41,3,1,3,0 3,51,2,4,3,0 5,50,3,2,3,1 4,47,4,4,3,0 4,54,4,5,3,1 5,52,4,4,3,1 4,30,1,1,3,0 3,48,4,4,3,1 5,?,4,4,3,1 4,65,2,4,3,1 4,50,1,1,3,0 5,65,4,5,3,1 5,66,4,3,3,1 6,41,3,3,2,1 5,72,3,2,3,1 4,42,1,1,1,1 4,80,4,4,3,1 0,45,2,4,3,0 4,41,1,1,3,0 4,72,3,3,3,1 4,60,4,5,3,0 5,67,4,3,3,1 4,55,2,1,3,0 4,61,3,4,3,1 4,55,3,4,3,1 4,52,4,4,3,1 4,42,1,1,3,0 5,63,4,4,3,1 4,62,4,5,3,1 4,46,1,1,3,0 4,65,2,1,3,0 4,57,3,3,3,1 4,66,4,5,3,1 4,45,1,1,3,0 4,77,4,5,3,1 4,35,1,1,3,0 4,50,4,5,3,1 4,57,4,4,3,0 4,74,3,1,3,1 4,59,4,5,3,0 4,51,1,1,3,0 4,42,3,4,3,1 4,35,2,4,3,0 4,42,1,1,3,0 4,43,2,1,3,0 4,62,4,4,3,1 4,27,2,1,3,0 5,?,4,3,3,1 4,57,4,4,3,1 4,59,2,1,3,0 5,40,3,2,3,1 4,20,1,1,3,0 5,74,4,3,3,1 4,22,1,1,3,0 4,57,4,3,3,0 4,57,4,3,3,1 4,55,2,1,2,0 4,62,2,1,3,0 4,54,1,1,3,0 4,71,1,1,3,1 4,65,3,3,3,0 4,68,4,4,3,0 4,64,1,1,3,0 4,54,2,4,3,0 4,48,4,4,3,1 4,58,4,3,3,0 5,58,3,4,3,1 4,70,1,1,1,0 5,70,1,4,3,1 4,59,2,1,3,0 4,57,2,4,3,0 4,53,4,5,3,0 4,54,4,4,3,1 4,53,2,1,3,0 0,71,4,4,3,1 5,67,4,5,3,1 4,68,4,4,3,1 4,56,2,4,3,0 4,35,2,1,3,0 4,52,4,4,3,1 4,47,2,1,3,0 4,56,4,5,3,1 4,64,4,5,3,0 5,66,4,5,3,1 4,62,3,3,3,0 ", na.strings="?", col.names=c("BIRADS","Age","Shape","Margin","Density","Severity")) # remove rows with missing items mydata = na.omit(mydata) # fix issues in the BIRADS column (all values meant to be between 1 and 5) mydata$BIRADS[mydata$BIRADS==0] = 1 mydata$BIRADS[mydata$BIRADS==55] = 5 # discretise age mydata$Age = cut(mydata$Age, breaks=c(0,45,55,75,Inf),labels=FALSE) # turn all columns into factors for (name in names(mydata)) mydata[,name] = factor(mydata[,name]) # return selected columns in order mydata[,c("BIRADS", "Shape", "Margin", "Density", "Age", "Severity")] } # examples ############################################################################### test.naive = function() { mammo = getdata() myclassifier = classifier.naive2(0) model = myclassifier$trainer(mammo, 1:5, 6) # correct case testrow = mammo[6,] print(testrow) print(myclassifier$tester(model, testrow)) # incorrect case testrow = mammo[5,] print(testrow) print(myclassifier$tester(model, testrow)) # cross validation print(kfcv.classifier(mammo, 1:5, 6, myclassifier)) } test.credal = function() { # set up the classifier myclassifier = classifier.composed( list(classifier.naive2(0), classifier.naive2(1), # naive with laplace correction classifier.credal(2))) # test with just first 30 observations to get some interesting effects mammo = getdata()[1:30,] # predict severity (column 6) from all attributes print(kfcv.classifier(mammo, 1:5, 6, myclassifier)) # predict severity from all attributes except BIRADS (column 1) print(kfcv.classifier(mammo, 2:5, 6, myclassifier)) # predict severity from BIRADS only print(kfcv.classifier(mammo, 1, 6, myclassifier)) # test with full data mammo = getdata() # predict BIRADS from other attributes print(kfcv.classifier(mammo, 2:5, 1, myclassifier)) }