# Author: Johannes Liegl # Date: 18.10.2009 # License: LGPL library("tm") library("class") library("e1071") library("slam") library("SparseM") # some functions topicFilter <- function (object, s, topicOfDoc) { query.df <- prescindMeta(object, c("Topics")) attach(query.df) boolFilter <- c() i <- 1 while (i <= length(Topics)) { res <- c(s) %in% Topics[[i]] boolFilter <- c(boolFilter, res) i <- i + 1 } if (!topicOfDoc) boolFilter <- unlist(lapply(boolFilter,`!`)) try(result <- rownames(query.df) %in% row.names(query.df[boolFilter,])) detach(query.df) result } attr(topicFilter,"doclevel") <- FALSE mColRow <- function (m, mRow, mCol) { retVal <- m[(dim(m)[1]*(mCol-1)) + mRow] } mColRowSparse <- function (m, mRow, mCol) { retVal <- m[(dim(m)[1]*(mCol-1)) + mRow] } weightTfc <- function(m) { isDTM <- inherits(m, "DocumentTermMatrix") if (isDTM) m <- t(m) sm <- m class(sm) <- "simple_triplet_matrix" idfVec <- log2(nDocs(m)/rowSums(m > 0)) curCol <- 1 rows <- dim(m)[1] cols <- dim(m)[2] normVec <- rep(1, cols) while (curCol <= cols) { curRow <- 1 curNormSum <- 0 while (curRow <= rows) { tempVal <- mColRow(sm, curRow,curCol) if (!is.na(tempVal)) { print(paste(tempVal," * ", idfVec[curRow], " ")) tempVal <- tempVal * idfVec[curRow] tempVal <- tempVal * tempVal } else { tempVal <- 0 } curNormSum <- curNormSum + tempVal curRow <- curRow + 1 } curNormSum <- curNormSum^0.5 if ( curNormSum == 0) curNormSum = 0.001 normVec[curCol] <- 1 / curNormSum curCol <- curCol + 1 } m <- m * idfVec m <- t(m) m <- m * normVec m <- t(m) if (isDTM) t(m) else m } weightTfc <- WeightFunction(weightTfc, "tfc weighting", "tfc") weightTfcSparse <- function(m) { isDTM <- inherits(m, "DocumentTermMatrix") if (isDTM) m <- t(m) sm <- m class(sm) <- "simple_triplet_matrix" idfVec <- log2(nDocs(m)/rowSums(m > 0)) cols <- dim(m)[2] curCol <- sm$j[1] curRow <- sm$i[1] curNormSum <- 0 k <- 1 normVec <- rep(1, cols) while (k <= length(sm$j)) { tempVal <- sm$v[k] tempVal <- tempVal * idfVec[curRow] tempVal <- tempVal * tempVal curNormSum <- curNormSum + tempVal k <- k + 1 if (k > length(sm$j) || curCol != sm$j[k]) { curNormSum <- curNormSum^0.5 # just in case, should actually never be 0 if (curNormSum == 0) curNormSum = 0.001 normVec[curCol] <- 1 / curNormSum curNormSum <- 0 } curCol <- sm$j[k] curRow <- sm$i[k] } m <- m * idfVec m <- t(m) m <- m * normVec m <- t(m) if (isDTM) t(m) else m } weightTfcSparse <- WeightFunction(weightTfcSparse, "tfc weighting sparse", "tfc") transformDocumentTermMatrixToCompressedSparseRowMatrix <- function (m) { isTDM <- inherits(m, "TermDocumentMatrix") if (isTDM) m <- t(m) mV <- as.numeric(m$v) k <- 1 tmpRa <- mV tmpJa <- m$j tmpIa <- c() colStart <- 0 while (k < length(m$i)) { tempVal <- m$i[k] if (tempVal > colStart) { while (colStart < tempVal) { tmpIa <- c(tmpIa, k) colStart <- colStart + 1 } } k <- k + 1 } tmpIa <- c(tmpIa, length(m$i) + 1) csr <- new("matrix.csr", ra = tmpRa, ja = as.integer(tmpJa), ia = as.integer(tmpIa), dimension = as.integer(c(dim(m)[1],dim(m)[2]))) if (isTDM) t(csr) else csr } maxDocNumPerTopicFilter <- function (object, maxDocsPerTopic) { query.df <- prescindMeta(object, c("Topics")) attach(query.df) boolFilter <- c() hash <- list() ctr <- 0 i <- 1 res <- FALSE while (i <= length(Topics)) { if (length(Topics[[i]]) > 0) { maxDocsReached <- FALSE j <- 1 while (j <= length(Topics[[i]])) { curTopic <- Topics[[i]][j] # initialize the hash for the current topic if (is.null(hash[[curTopic]])) { hash[[curTopic]] <- 0 } if ((hash[[curTopic]] + 1) > maxDocsPerTopic) { maxDocsReached <- TRUE } j <- j + 1 } if (!maxDocsReached) { j <- 1 while (j <= length(Topics[[i]])) { curTopic <- Topics[[i]][j] hash[[curTopic]] <- (hash[[curTopic]] + 1) j <- j + 1 } res <- c(TRUE) ctr <- ctr +1 } else { res <- c(FALSE) } } else { # we don't take documents without a topic res <- c(FALSE) } boolFilter <- c(boolFilter, res) i <- i + 1 } try(result <- rownames(query.df) %in% row.names(query.df[boolFilter,])) detach(query.df) result } attr(maxDocNumPerTopicFilter,"doclevel") <- FALSE precRecallPlot <- function(diagname, subtitle) { z.min <- -0.1 z.max <- -z.min plot(NA, NA, xlim=c(0, 40), ylim=c(z.min, 1), type="n", xaxt="n", yaxt="n", frame = FALSE, xlab = "Topics", ylab = "Precision - Recall", main=diagname, sub=subtitle) axis(side = 1, labels=NA, at = c(2, 6, 10, 14, 18, 22, 26, 30, 34, 38), line = 0, tick=TRUE, outer = FALSE) text(x=c(2, 6, 10, 14, 18, 22, 26, 30, 34, 38), y=rep(z.min-0.1, 10), labels=c("earn", "acq", "money-fx", "grain", "crude", "trade", "interest", "ship", "wheat", "corn"), xpd = NA, cex=1) axis(side=2, at=c(0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1), labels=NA, line = 0, tck = -0.01) text(x=rep(-3,11), y=c(0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1), labels=c(0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1), xpd = NA, cex=1) box() } plotPrecRec <- function(tfPrec, tfIdfPrec, tfcPrec, tfRec, tfIdfRec, tfcRec, diagname, subtitle) { precRecallPlot(diagname, subtitle) xVals <- c(2, 6, 10, 14, 18, 22, 26, 30, 34, 38) lines(xVals, tfPrec, col="black", type="o") lines(xVals, tfIdfPrec, col="blue", type="o") lines(xVals, tfcPrec, col="red", type="o") lines(xVals, tfRec, col="black", type="o", lty=2) lines(xVals, tfIdfRec, col="blue", type="o", lty=2) lines(xVals, tfcRec, col="red", type="o", lty=2) } # commands to be executed baseDir <- "C:/coding/workspace/SVMLearning/InRPlease/" reuters21578TrainDir <- DirSource(paste(baseDir,"reuters-21578-xml-train",sep="")) reuters21578TestDir <- DirSource(paste(baseDir,"reuters-21578-xml-test",sep="")) reuters21578TrainCorpus <- Corpus(reuters21578TrainDir, readerControl = list(reader = readReut21578XMLasPlain)) reuters21578TestCorpus <- Corpus(reuters21578TestDir, readerControl = list(reader = readReut21578XMLasPlain)) trainCorpTM <- DocumentTermMatrix(reuters21578TrainCorpus, control = list(stopwords = TRUE, weighting = weightTf, minDocFrequency = 3)) dict <- Dictionary(trainCorpTM) topic <- "earn" maxDocs <- 300 # Train Corpus Split for a given topic-Class (Pos/Neg) topicCorpusTrainPos <- tm_filter(reuters21578TrainCorpus, FUN=topicFilter, topic, topicOfDoc = TRUE, doclevel=FALSE) topicCorpusTrainNeg <- tm_filter(reuters21578TrainCorpus, FUN=topicFilter, topic, topicOfDoc = FALSE, doclevel=FALSE) topicCorpusTrainNegDocLimitPerTopic <- tm_filter(topicCorpusTrainNeg, FUN=maxDocNumPerTopicFilter, maxDocsPerTopic = 10, doclevel = FALSE) # merge the corpora - just take 200 of positive instances because of the SVM model size svmPosNegTrainCorpus <- c(topicCorpusTrainPos[1:min(maxDocs,length(topicCorpusTrainPos))], topicCorpusTrainNegDocLimitPerTopic) # construct corpus with one element containing all terms in the training corpus - needed for SVM docs <- c(paste(dict, collapse = " ")) vs <- VectorSource(docs) singleDocCorpus <- Corpus(vs) svmPosNegTrainCorpusI <- c(svmPosNegTrainCorpus, singleDocCorpus) # construct the matrix xx <- DocumentTermMatrix(svmPosNegTrainCorpusI, control = list(stopwords = TRUE, weighting = weightTfcSparse, dictionary = dict)) # strip of the artificial last document and learn a model xxi <- xx[1:dim(xx)[1]-1] xxi <- transformDocumentTermMatrixToCompressedSparseRowMatrix(xxi) # create the label vector for svm learning yPos <- rep(1, length(topicCorpusTrainPos[1:min(maxDocs,length(topicCorpusTrainPos))])) yNeg <- rep(-1, length(topicCorpusTrainNegDocLimitPerTopic)) y <- c(yPos, yNeg) model <- svm(xxi, y, type="C-classification", kernel = "linear", gamma = 1, degree = 2) # Evaluation # Test Corpus Split for a given topic-class (Pos/Neg) topicCorpusTestPos <- tm_filter(reuters21578TestCorpus, FUN=topicFilter, topic, topicOfDoc = TRUE, doclevel=TRUE) topicCorpusTestNeg <- tm_filter(reuters21578TestCorpus, FUN=topicFilter, topic, topicOfDoc = FALSE, doclevel=TRUE) topicCorpusTestNegDocLimitPerTopic <- tm_filter(topicCorpusTestNeg, FUN=maxDocNumPerTopicFilter, maxDocsPerTopic = 10, doclevel = FALSE) svmPosNegTestCorpus <- c(topicCorpusTestPos[1:min(maxDocs,length(topicCorpusTestPos))], topicCorpusTestNegDocLimitPerTopic) svmPosNegTestCorpusI <- c(svmPosNegTestCorpus, singleDocCorpus) # create the label vector for svm learning yTestPos <- rep(1, length(topicCorpusTestPos[1:min(maxDocs,length(topicCorpusTestPos))])) yTestNeg <- rep(-1, length(topicCorpusTestNegDocLimitPerTopic)) yTest <- c(yTestPos, yTestNeg) xxT <- DocumentTermMatrix(svmPosNegTestCorpusI, control = list(stopwords = TRUE, weighting = weightTfcSparse, dictionary = dict)) # Transform the simple triplet matrix into a compressed sparse row matrix xxTi <- xxT[1:dim(xxT)[1]-1] xxTi <- transformDocumentTermMatrixToCompressedSparseRowMatrix(xxTi) # apply the model on the test data svmPrediction <- predict(model, xxTi) # re-labelling: 1 -> topic and -1 -> notTopic notTopic <- paste('not', topic, sep = '') svmPredictionI <- svmPrediction svmPredictionI[which(svmPredictionI == -1)] = notTopic svmPredictionI[which(svmPredictionI == 1)] = topic yTestI <- yTest yTestI[which(yTestI == -1)] = notTopic yTestI[which(yTestI == 1)] = topic svmTable <- table(svm = factor(svmPredictionI, levels = c(topic, notTopic)), reuters=factor(yTestI, levels = c(topic,notTopic))) precision <- svmTable[1,1] / ( svmTable[1,1] + svmTable[1,2] + 0.0001) recall <- svmTable[1,1] / ( svmTable[1,1] + svmTable[2,1] + 0.0001) accuracy <- sum(diag(svmTable)) / sum(svmTable) print(paste('topic: ', "earn", 'accuracy: ', accuracy, 'precision: ', precision, 'recall: ', recall, " "))