### Example Code ----------------------------------------------
library(quanteda)
library(SnowballC) 
library(gtools)
library(refset)
library(randomForest)

setwd("C:/Work/Papers/ML") # change this line to appropriate Working directory

set.seed(324789632) # important for replicability of results

# Read Data -------------------------------------------------------------------------------------------------------

d <- read.delim("SL.txt", stringsAsFactors = FALSE, na.strings = ".") # Import data from file in Working directory
d <- d[! is.na(d$level),] # Remove observations without level classification

# Create corpus (cps) and document-feature-matrix -------------------------------------------------------------------------------------------

cps <- corpus(d$message)
mystop <- c(".", ",", "!", "/", "(", ")", "-", ":", "'", "?", "%", "a", "b", "a's", "b's", "as", "bs", "A", "B",  "black", "white", stopwords("english")) # 
dfmat <- dfm(cps, remove = mystop, stem = TRUE)
dfmat <- dfmat[,colSums(dfmat) >= 5]

# Create Word Cloud plot

topfeatures(dfmat[d$level<0.5], 100)
png(file = "cloudSL.png", width = 600, height = 600, pointsize=24)
textplot_wordcloud(dfmat[d$level<0.5], random_order = FALSE)
dev.off()

# Create test and training data -----------------------------------------------------------------------------------

train.prop <- .7
train.rows <- test.rows <- numeric(0)
for (tm in unique(d$treatment)) {
  rows <- which(d$treatment == tm)
  nr <- length(rows)
  train.rows <- c(train.rows, sample(rows, floor(nr * train.prop)))
  test.rows <- c(test.rows, setdiff(rows, train.rows))
}
train.rows <- train.rows[ ! is.na(d$level[train.rows]) ]
test.rows <- test.rows[ ! is.na(d$level[test.rows]) ]
dtr %r% d[train.rows,]
dtest %r% d[test.rows,]
dfmtr <- dfmat[train.rows,]
dfmtest <- dfmat[test.rows,]

### random forest ---------------------------------------------------------------------------------------------------
## training
# Regression
rf1 <- randomForest(x = as.matrix(dfmtr), y = dtr$level, keep.forest = TRUE, importance = TRUE, proximity = TRUE)
# Classification
rf2 <- randomForest(x = as.matrix(dfmtr), y = factor(dtr$level), keep.forest = TRUE, importance = TRUE)

## testing ---------------------------------------------------------------------------------------------------------
# Regression
predict(rf1, as.matrix(dfmtest)) -> rf1p
round(rf1p, 0) -> rf1pround

print(tab <- table(rf1pround, dtest$level))
cor.test(rf1p, dtest$level)
print(summary(lm(dtest$level ~ 0 + rf1p)))

# Classification
predict(rf2, as.matrix(dfmtest)) -> rf2p
rf2pchar <- as.character(rf2p)
rf2pnum <- as.numeric(rf2pchar)

print(tab <- table(rf2p, dtest$level))
cor.test(rf2pnum, dtest$level)
print(summary(lm(dtest$level ~ 0 + rf2pnum)))
	
## Importance ------------------------------------------------------------------------------------------------------
# Regression 
impo <- round(importance(rf1), 3)
impoMSE <- impo[,c(1)]
barlabels <- names(impoMSE)[order(impoMSE, decreasing=T)[1:30]]
impoMSE <- impoMSE[order(impoMSE, decreasing=T)[1:30]]
par(mar = c(5,6,4,2) + 0.2)
barplot(impoMSE, horiz = TRUE, names.arg = barlabels, las=1, xlab="Importance")
dev.off()

# Classification 
impo <- round(importance(rf2), 3)
impoGini <- impo[,c(6)]
barlabels <- names(impoGini)[order(impoGini, decreasing=T)[1:30]]
impoGini <- impoGini[order(impoGini, decreasing=T)[1:30]]
par(mar = c(5,6,4,2) + 0.2)
barplot(impoGini, horiz = TRUE, names.arg = barlabels, las=1, xlab="Importance")
dev.off()


## Cross-validation --------------------------------------------------------------------------------

chunks <- 10
rf1pred <- numeric(0)
rf2pred <- numeric(0)

for (run in 1:chunks) {

train.rows <- test.rows <- numeric(0)
for (tm in unique(d$treatment)) {
  rows <- which(d$treatment == tm)
  nr <- length(rows)
  test.rows <- c(test.rows,  rows[c(ceiling(((run-1)*nr/(chunks-0.00001))):floor((run*nr/chunks)))])
  train.rows <- c(train.rows, setdiff(rows, test.rows))
}
train.rows <- train.rows[ ! is.na(d$level[train.rows]) ]
test.rows <- test.rows[ ! is.na(d$level[test.rows]) ]
dtr %r% d[train.rows,]
dtest %r% d[test.rows,]
dfmtr <- dfmat[train.rows,]
dfmtest <- dfmat[test.rows,]
cpstr <- subset(cps, 1:999 %in% train.rows)
cpstest <- subset(cps, 1:999 %in% test.rows)

rf1 <- randomForest(x = as.matrix(dfmtr), y = dtr$level, keep.forest = TRUE, importance = TRUE, proximity = TRUE)
rf2 <- randomForest(x = as.matrix(dfmtr), y = factor(dtr$level), keep.forest = TRUE, importance = TRUE, classwt = )

predict(rf1, as.matrix(dfmtest)) -> rf1p
predict(rf2, as.matrix(dfmtest)) -> rf2p
rf1pred <- c(rf1pred, rf1p)
rf2pred <- factor(c(as.character(rf2pred), as.character(rf2p)))
if (run==1){
dtestX <- dtest
} else {
dtestX <- mapply(c, dtestX, dtest, SIMPLIFY=FALSE)
}

}

# Random forest (regression)
cor.test(rf1pred, dtestX$level)
print(summary(lm(dtestX$level ~ 0 + rf1pred)))
round(rf1pred, 0) -> rf1predround
round(dtestX$level, 0) -> levelround
print(tab <- table(rf1predround, levelround))
countXr <- length(which(levelround==rf1predround))

# Random forest (classification)
print(tab <- table(rf2pred, levelround))
rf2predchar <- as.character(rf2pred)
rf2prednum <- as.numeric(rf2predchar)
cor.test(rf2prednum, dtestX$level)
print(summary(lm(dtestX$level ~ 0 + rf2prednum)))
countXc <- length(which(rf2prednum==dtestX$level))

# permutation test of cross-validation --------------------------------------------------------------------------------

permutations <- 20  # change to 2000 or number of your choice after a test run with 20
CountR <- numeric()
CountC <- numeric()

for (perm in 1:permutations) {

chunks <- 10
rf1pred <- numeric(0)
rf2pred <- numeric(0)

for (run in 1:chunks) {

train.rows <- test.rows <- numeric(0)
for (tm in unique(d$treatment)) {
  rows <- which(d$treatment == tm)
  nr <- length(rows)
  test.rows <- c(test.rows,  rows[c(ceiling(((run-1)*nr/(chunks-0.00001))):floor((run*nr/chunks)))])
  train.rows <- c(train.rows, setdiff(rows, test.rows))
}
train.rows <- train.rows[ ! is.na(d$level[train.rows]) ]
test.rows <- test.rows[ ! is.na(d$level[test.rows]) ]
d$level <- permute(d$level)    # Permutation of labels: level information
dtr %r% d[train.rows,]
dtest %r% d[test.rows,]
dfmtr <- dfmat[train.rows,]
dfmtest <- dfmat[test.rows,]
cpstr <- subset(cps, 1:999 %in% train.rows)
cpstest <- subset(cps, 1:999 %in% test.rows)

library(randomForest)
rf1 <- randomForest(x = as.matrix(dfmtr), y = dtr$level, keep.forest = TRUE, importance = TRUE, proximity = TRUE)
rf2 <- randomForest(x = as.matrix(dfmtr), y = factor(dtr$level), keep.forest = TRUE, importance = TRUE, classwt = )

predict(rf1, as.matrix(dfmtest)) -> rf1p
predict(rf2, as.matrix(dfmtest)) -> rf2p
rf1pred <- c(rf1pred, rf1p)
rf2pred <- factor(c(as.character(rf2pred), as.character(rf2p)))
if (run==1){
dtestX <- dtest
} else {
dtestX <- mapply(c, dtestX, dtest, SIMPLIFY=FALSE)
}

}

# Random forest (regression)
cor.test(rf1pred, dtestX$level)
print(summary(lm(dtestX$level ~ 0 + rf1pred)))
round(rf1pred, 0) -> rf1predround
round(dtestX$level, 0) -> levelround
tab <- table(rf1predround, levelround)
#write.table(tab, file = "SLtablePredictionXRegression.csv", sep = ",", col.names = NA)
count1r <- length(which(levelround==rf1predround))
CountR <- c(CountR,count1r)

# Random forest (classification)
tab <- table(rf2pred, levelround)
rf2predchar <- as.character(rf2pred)
rf2prednum <- as.numeric(rf2predchar)
cor.test(rf2prednum, dtestX$level)
print(summary(lm(dtestX$level ~ 0 + rf2prednum)))
#write.table(tab, file = "SLtablePredictionXClassification.csv", sep = ",", col.names = NA)
count1c <- length(which(rf2prednum==dtestX$level))
CountC <- c(CountC,count1c)
print(perm)
}

table(CountC[1:perm])   # Compare with 
countXc

table(CountR[1:perm])   # Compare with 
countXr