### Example Code ---------------------------------------------- library(quanteda) library(SnowballC) library(gtools) library(refset) library(randomForest) setwd("C:/Work/Papers/ML") # change this line to appropriate Working directory set.seed(324789632) # important for replicability of results # Read Data ------------------------------------------------------------------------------------------------------- d <- read.delim("SL.txt", stringsAsFactors = FALSE, na.strings = ".") # Import data from file in Working directory d <- d[! is.na(d$level),] # Remove observations without level classification # Create corpus (cps) and document-feature-matrix ------------------------------------------------------------------------------------------- cps <- corpus(d$message) mystop <- c(".", ",", "!", "/", "(", ")", "-", ":", "'", "?", "%", "a", "b", "a's", "b's", "as", "bs", "A", "B", "black", "white", stopwords("english")) # dfmat <- dfm(cps, remove = mystop, stem = TRUE) dfmat <- dfmat[,colSums(dfmat) >= 5] # Create Word Cloud plot topfeatures(dfmat[d$level<0.5], 100) png(file = "cloudSL.png", width = 600, height = 600, pointsize=24) textplot_wordcloud(dfmat[d$level<0.5], random_order = FALSE) dev.off() # Create test and training data ----------------------------------------------------------------------------------- train.prop <- .7 train.rows <- test.rows <- numeric(0) for (tm in unique(d$treatment)) { rows <- which(d$treatment == tm) nr <- length(rows) train.rows <- c(train.rows, sample(rows, floor(nr * train.prop))) test.rows <- c(test.rows, setdiff(rows, train.rows)) } train.rows <- train.rows[ ! is.na(d$level[train.rows]) ] test.rows <- test.rows[ ! is.na(d$level[test.rows]) ] dtr %r% d[train.rows,] dtest %r% d[test.rows,] dfmtr <- dfmat[train.rows,] dfmtest <- dfmat[test.rows,] ### random forest --------------------------------------------------------------------------------------------------- ## training # Regression rf1 <- randomForest(x = as.matrix(dfmtr), y = dtr$level, keep.forest = TRUE, importance = TRUE, proximity = TRUE) # Classification rf2 <- randomForest(x = as.matrix(dfmtr), y = factor(dtr$level), keep.forest = TRUE, importance = TRUE) ## testing --------------------------------------------------------------------------------------------------------- # Regression predict(rf1, as.matrix(dfmtest)) -> rf1p round(rf1p, 0) -> rf1pround print(tab <- table(rf1pround, dtest$level)) cor.test(rf1p, dtest$level) print(summary(lm(dtest$level ~ 0 + rf1p))) # Classification predict(rf2, as.matrix(dfmtest)) -> rf2p rf2pchar <- as.character(rf2p) rf2pnum <- as.numeric(rf2pchar) print(tab <- table(rf2p, dtest$level)) cor.test(rf2pnum, dtest$level) print(summary(lm(dtest$level ~ 0 + rf2pnum))) ## Importance ------------------------------------------------------------------------------------------------------ # Regression impo <- round(importance(rf1), 3) impoMSE <- impo[,c(1)] barlabels <- names(impoMSE)[order(impoMSE, decreasing=T)[1:30]] impoMSE <- impoMSE[order(impoMSE, decreasing=T)[1:30]] par(mar = c(5,6,4,2) + 0.2) barplot(impoMSE, horiz = TRUE, names.arg = barlabels, las=1, xlab="Importance") dev.off() # Classification impo <- round(importance(rf2), 3) impoGini <- impo[,c(6)] barlabels <- names(impoGini)[order(impoGini, decreasing=T)[1:30]] impoGini <- impoGini[order(impoGini, decreasing=T)[1:30]] par(mar = c(5,6,4,2) + 0.2) barplot(impoGini, horiz = TRUE, names.arg = barlabels, las=1, xlab="Importance") dev.off() ## Cross-validation -------------------------------------------------------------------------------- chunks <- 10 rf1pred <- numeric(0) rf2pred <- numeric(0) for (run in 1:chunks) { train.rows <- test.rows <- numeric(0) for (tm in unique(d$treatment)) { rows <- which(d$treatment == tm) nr <- length(rows) test.rows <- c(test.rows, rows[c(ceiling(((run-1)*nr/(chunks-0.00001))):floor((run*nr/chunks)))]) train.rows <- c(train.rows, setdiff(rows, test.rows)) } train.rows <- train.rows[ ! is.na(d$level[train.rows]) ] test.rows <- test.rows[ ! is.na(d$level[test.rows]) ] dtr %r% d[train.rows,] dtest %r% d[test.rows,] dfmtr <- dfmat[train.rows,] dfmtest <- dfmat[test.rows,] cpstr <- subset(cps, 1:999 %in% train.rows) cpstest <- subset(cps, 1:999 %in% test.rows) rf1 <- randomForest(x = as.matrix(dfmtr), y = dtr$level, keep.forest = TRUE, importance = TRUE, proximity = TRUE) rf2 <- randomForest(x = as.matrix(dfmtr), y = factor(dtr$level), keep.forest = TRUE, importance = TRUE, classwt = ) predict(rf1, as.matrix(dfmtest)) -> rf1p predict(rf2, as.matrix(dfmtest)) -> rf2p rf1pred <- c(rf1pred, rf1p) rf2pred <- factor(c(as.character(rf2pred), as.character(rf2p))) if (run==1){ dtestX <- dtest } else { dtestX <- mapply(c, dtestX, dtest, SIMPLIFY=FALSE) } } # Random forest (regression) cor.test(rf1pred, dtestX$level) print(summary(lm(dtestX$level ~ 0 + rf1pred))) round(rf1pred, 0) -> rf1predround round(dtestX$level, 0) -> levelround print(tab <- table(rf1predround, levelround)) countXr <- length(which(levelround==rf1predround)) # Random forest (classification) print(tab <- table(rf2pred, levelround)) rf2predchar <- as.character(rf2pred) rf2prednum <- as.numeric(rf2predchar) cor.test(rf2prednum, dtestX$level) print(summary(lm(dtestX$level ~ 0 + rf2prednum))) countXc <- length(which(rf2prednum==dtestX$level)) # permutation test of cross-validation -------------------------------------------------------------------------------- permutations <- 20 # change to 2000 or number of your choice after a test run with 20 CountR <- numeric() CountC <- numeric() for (perm in 1:permutations) { chunks <- 10 rf1pred <- numeric(0) rf2pred <- numeric(0) for (run in 1:chunks) { train.rows <- test.rows <- numeric(0) for (tm in unique(d$treatment)) { rows <- which(d$treatment == tm) nr <- length(rows) test.rows <- c(test.rows, rows[c(ceiling(((run-1)*nr/(chunks-0.00001))):floor((run*nr/chunks)))]) train.rows <- c(train.rows, setdiff(rows, test.rows)) } train.rows <- train.rows[ ! is.na(d$level[train.rows]) ] test.rows <- test.rows[ ! is.na(d$level[test.rows]) ] d$level <- permute(d$level) # Permutation of labels: level information dtr %r% d[train.rows,] dtest %r% d[test.rows,] dfmtr <- dfmat[train.rows,] dfmtest <- dfmat[test.rows,] cpstr <- subset(cps, 1:999 %in% train.rows) cpstest <- subset(cps, 1:999 %in% test.rows) library(randomForest) rf1 <- randomForest(x = as.matrix(dfmtr), y = dtr$level, keep.forest = TRUE, importance = TRUE, proximity = TRUE) rf2 <- randomForest(x = as.matrix(dfmtr), y = factor(dtr$level), keep.forest = TRUE, importance = TRUE, classwt = ) predict(rf1, as.matrix(dfmtest)) -> rf1p predict(rf2, as.matrix(dfmtest)) -> rf2p rf1pred <- c(rf1pred, rf1p) rf2pred <- factor(c(as.character(rf2pred), as.character(rf2p))) if (run==1){ dtestX <- dtest } else { dtestX <- mapply(c, dtestX, dtest, SIMPLIFY=FALSE) } } # Random forest (regression) cor.test(rf1pred, dtestX$level) print(summary(lm(dtestX$level ~ 0 + rf1pred))) round(rf1pred, 0) -> rf1predround round(dtestX$level, 0) -> levelround tab <- table(rf1predround, levelround) #write.table(tab, file = "SLtablePredictionXRegression.csv", sep = ",", col.names = NA) count1r <- length(which(levelround==rf1predround)) CountR <- c(CountR,count1r) # Random forest (classification) tab <- table(rf2pred, levelround) rf2predchar <- as.character(rf2pred) rf2prednum <- as.numeric(rf2predchar) cor.test(rf2prednum, dtestX$level) print(summary(lm(dtestX$level ~ 0 + rf2prednum))) #write.table(tab, file = "SLtablePredictionXClassification.csv", sep = ",", col.names = NA) count1c <- length(which(rf2prednum==dtestX$level)) CountC <- c(CountC,count1c) print(perm) } table(CountC[1:perm]) # Compare with countXc table(CountR[1:perm]) # Compare with countXr