# Replication code for "Quantifying Economic Policy"

library(readtext)
library(quanteda)
library(stringi)
library(stm)

# The main results on the command papers corpus make use of the all_dfmat document feature matrix, included in the RData replication file.
# The main model on the command papers is all_stm, defined below.
# Intermediate steps for the creation of the document feature matrix and of the various results are included in the commented lines. 
# The replication code for each figure and table in the body of the paper is indicated throughout the script.

# all_cmd <- readtext("/Volumes/SAMSUNG/command papers/*/*.pdf",
#                     docvarsfrom="filepaths",
#                     dvsep="/",
#                     verbosity=3
#                     )
# 
# 
# all_cmd$text <- stri_replace_all_fixed(all_cmd$text, "\n", " ")
# 
# all_cmd_cut <- all_cmd
# 
# all_cmd_cut$text <- substr(all_cmd_cut$text, 1, 20000*5)

# The below removes the international treaties from the sample: 
# all_cmd_notr <- all_cmd_cut[!str_detect(tolower(substr(all_cmd_cut$text, 1, 300*5)), "commonwealth"),]
# all_cmd_notr <- all_cmd_notr[!str_detect(tolower(substr(all_cmd_notr$text, 1, 300*5)), "treaty"),]
# 
# The below indicates the white papers:
# wp <- which(str_detect(tolower(all_cmd_notr$text), "this white paper"))
# 
# all_cmd_wp <- all_cmd_notr[wp,]
# 
# 
# Make the text a corpus:
# 
# all <- corpus(all_cmd_notr)
# all_wp <- corpus(all_cmd_wp)
# 
# all_toks <- tokens(all,
#                remove_numbers=TRUE,
#                remove_punct=TRUE,
#                remove_symbols=TRUE,
#                remove_separators=TRUE,
#                remove_url = TRUE,
#                split_hyphens=TRUE,
#                preserve_tags=FALSE,
#                verbose=TRUE)
# 
# rm(all_toks)
# all_toks <- tokens_tolower(all_toks)
# 
# all_toks <- tokens_remove(all_toks, pattern = stopwords('en'), min_nchar=3)
# 
# all_toks <- tokens_remove(all_toks, pattern = c("house", "of", "commons", "parliamentary", "papers", "online", 
#                                                 "copyright", "proquest", "llc", "information", "learning", "company", "all", "rights", "reserved"))
# 
# 

# Main document-feature matrix, included in the RData workspace.
# The tokenisation steps above are commented out, so `all_toks` normally does
# not exist when this script is run against the replication file. Rebuild the
# matrix only when the tokens are available; otherwise use the `all_dfmat`
# object already loaded in the workspace.
if (exists("all_toks")) {
  all_dfmat <- dfm(all_toks)
}
stopifnot(exists("all_dfmat"))

# Create the metadata required for the covariates.
# docvar5 (derived from the file path by readtext) is used as the document
# year throughout this script.
doc_year <- docvars(all_dfmat)$docvar5

metadata <- data.frame(
  # Coerce explicitly so the comparisons below and s(year, 10) work on numbers
  # even if the path-derived docvar was read as text.
  year = as.numeric(as.character(doc_year)),
  gov = rep(NA_character_, length(doc_year)),
  # Raw copy of docvar5, kept as in the original script; it is not modified
  # anywhere in the code visible here.
  post10 = doc_year,
  stringsAsFactors = FALSE
)

# Map each year to the government in office. The intervals are closed on the
# left: < 1990 MT, [1990, 1997) JM, [1997, 2007) TB, [2007, 2010) GB,
# [2010, 2016) DC, [2016, 2019) TM, >= 2019 BJ. Documents with a missing year
# keep gov = NA. gov stays a character vector so that the level ordering seen
# by stm is unchanged.
gov_breaks <- c(1990, 1997, 2007, 2010, 2016, 2019)
gov_codes <- c("MT", "JM", "TB", "GB", "DC", "TM", "BJ")
metadata$gov <- gov_codes[findInterval(metadata$year, gov_breaks) + 1L]

# Main stm model for command papers (Table 1):
# 20 topics; topic prevalence depends on a spline in year plus the
# government indicator held in `metadata`.
all_stm <- stm(
  documents = all_dfmat,
  K = 20,
  prevalence = ~ s(year, 10) + gov,
  data = metadata,
  seed = 123
)

summary(all_stm)

# Hand-assigned names for the 20 topics of all_stm, in topic-number order.
labels_all_stm <- c(
  "Inquiries",
  "Immigration",
  "Consultations",
  "Economic outlook",
  "Devolution / intl.",
  "Research and innovation",
  "Health and safety",
  "Bills",
  "Public sector pay",
  "Energy and climate",
  "Security",
  "Justice",
  "Local govt",
  "Public spending",
  "Health",
  "Education",
  "Social insurance",
  "Crime",
  "Markets and competition",
  "Public sector performance"
)

# Topic numbers used for the plots and tables below
# (presumably the economic topics - confirm against the paper).
ec <- c(4, 6, 9, 10, 14, 15, 16, 17, 19, 20)

# Alternative K presented in the appendix: same specification with 15 and
# 25 topics.
all_stm_15 <- stm(
  documents = all_dfmat,
  K = 15,
  prevalence = ~ s(year, 10) + gov,
  data = metadata,
  seed = 123
)

# (marked "rerun" in the original script)
all_stm_25 <- stm(
  documents = all_dfmat,
  K = 25,
  prevalence = ~ s(year, 10) + gov,
  data = metadata,
  seed = 123
)




# Marginal effects for all_stm, used for time results in body:
# topic proportions regressed on a spline in year.
estd <- estimateEffect(
  ~ s(year, 10),
  all_stm,
  metadata = metadata,
  uncertainty = "Global",
  nsims = 25
)

# Marginal effects for all_stm, used for government effects, table 3:
# topic proportions regressed on the government indicator only.
estd_gov <- estimateEffect(
  ~gov,
  all_stm,
  metadata = metadata,
  uncertainty = "Global",
  nsims = 25
)

# Government effects with an additional s(year, 3) term.
estd_gov_q <- estimateEffect(
  ~ gov + s(year, 3),
  all_stm,
  metadata = metadata,
  uncertainty = "Global",
  nsims = 25
)

# Continuous plots - Figure 1:
# topic 9 is drawn on its default vertical scale; the remaining three panels
# share ylim = c(0, .12).
plot.estimateEffect(
  estd,
  topics = 9, covariate = "year", method = "continuous",
  printlegend = FALSE, npoints = 100, nsims = 50
)

for (fig1_topics in list(c(15, 16, 20), c(4, 6, 10), c(14, 17, 19))) {
  plot.estimateEffect(
    estd,
    topics = fig1_topics, covariate = "year", method = "continuous",
    printlegend = FALSE, npoints = 100, nsims = 50, ylim = c(0, 0.12)
  )
}

# Figure 2:
# Difference 85 to 20 (year set to 2020 versus 1985)
# Margins are c(bottom, left, top, right); the left margin is widened to 6.2
# lines, presumably to fit the custom topic labels.
par(mar = c(5.1, 6.2, 4.1, 2.1))
e1 <- plot.estimateEffect(
  estd,
  covariate = "year", topics = ec, method = "difference",
  cov.value1 = 2020, cov.value2 = 1985,
  labeltype = "custom", custom.labels = labels_all_stm[ec]
)
# Ranking of the plotted topics by estimated difference (positions within ec)
order(unlist(e1$means))



# Difference 85 to 2005 (year set to 2005 versus 1985)
# Left margin back at 4.1 lines.
par(mar = c(5.1, 4.1, 4.1, 2.1))
e2 <- plot.estimateEffect(
  estd,
  covariate = "year", topics = ec, method = "difference",
  cov.value1 = 2005, cov.value2 = 1985,
  labeltype = "custom", custom.labels = labels_all_stm[ec]
)
order(unlist(e2$means))


# Illustrate perspectives between topics - Figure 3
# Each call contrasts the vocabulary of two topics of all_stm.
plot(
  all_stm,
  type = "perspectives",
  topics = c(19, 6),
  plabels = c("Markets and competition", "Research and innovation"),
  text.cex = 0.9
)

plot(
  all_stm,
  type = "perspectives",
  topics = c(14, 4),
  plabels = c("Public spending", "Economic outlook"),
  text.cex = 1
)

plot(
  all_stm,
  type = "perspectives",
  topics = c(14, 20),
  plabels = c("Public spending", "P. s. performance"),
  text.cex = 0.9
)

# Gov effects - Table 2
# Point estimates by government for topic 10.
plot.estimateEffect(
  estd_gov,
  topics = 10, covariate = "gov", method = "pointestimate",
  printlegend = FALSE, nsims = 50
)


# Prime-minister codes used as axis labels in the panels below.
# NOTE(review): custom.labels are applied positionally - confirm that this
# order matches the order in which plot.estimateEffect lists the gov values.
pms <- c("MT", "JM", "TB", "GB", "DC", "TM", "BJ")

# One point-estimate panel per topic in ec (2 x 5 grid).
par(mfrow = c(2, 5))
for (topic_id in ec) {
  plot.estimateEffect(
    estd_gov,
    covariate = "gov", method = "pointestimate", topics = topic_id,
    printlegend = FALSE, main = labels_all_stm[topic_id],
    verbose.labels = FALSE, labeltype = "custom", custom.labels = pms
  )
}

# Successive government contrasts: each entry gives the incoming government
# (cov.value1), the outgoing government (cov.value2) and the panel title.
gov_contrasts <- list(
  c(new = "JM", old = "MT", title = "Major vs Thatcher"),
  c(new = "TB", old = "JM", title = "Blair vs Major"),
  c(new = "GB", old = "TB", title = "Brown vs Blair"),
  c(new = "DC", old = "GB", title = "Cameron vs Brown"),
  c(new = "TM", old = "DC", title = "May vs Cameron"),
  c(new = "BJ", old = "TM", title = "Johnson vs May")
)

# differences
par(mfrow = c(3, 2))
for (ctr in gov_contrasts) {
  plot.estimateEffect(
    estd_gov,
    topics = ec, covariate = "gov", method = "difference",
    cov.value1 = ctr[["new"]], cov.value2 = ctr[["old"]],
    printlegend = FALSE, nsims = 50,
    labeltype = "custom", custom.labels = labels_all_stm[ec],
    main = ctr[["title"]]
  )
}

# Close the current graphics device before the next grid of panels.
dev.off()

# differences from the model that also includes the s(year, 3) trend
par(mfrow = c(3, 2))
for (ctr in gov_contrasts) {
  plot.estimateEffect(
    estd_gov_q,
    topics = ec, covariate = "gov", method = "difference",
    cov.value1 = ctr[["new"]], cov.value2 = ctr[["old"]],
    printlegend = FALSE, nsims = 50,
    labeltype = "custom", custom.labels = labels_all_stm[ec],
    main = ctr[["title"]]
  )
}


# Results on groups of agents, Appendix 2.5:
# The 2000 most frequent features of the corpus.
tf <- topfeatures(all_dfmat, n = 2000)

# Dictionaries of terms referring to each group of agents.

# Agents not assigned to business, employees or consumers.
other <- c(
  "people", "management", "members", "person", "organisations",
  "individuals", "parties", "chief", "offenders", "providers",
  "organisation", "party", "respondents", "prisoners", "association",
  "partners", "team", "professor", "adult", "media",
  "military", "leadership", "managers", "university", "establishments",
  "households", "boards", "adults", "offender", "professionals",
  "representatives", "citizens", "administrations", "applicants", "household",
  "universities", "institute", "colleges", "people's", "agents",
  "chair", "leaders", "expert", "prisoner", "associations",
  "female", "auditor", "inspector", "brigades", "male"
)

business <- c(
  "sector", "business", "industry", "companies", "employers",
  "businesses", "sectors", "bank", "suppliers", "corporate",
  "employer", "banks", "corporation", "manufacturers", "firm"
)

employees <- c(
  "staff", "officers", "officer", "teachers", "workers",
  "labour", "personnel", "workforce", "union", "employees",
  "doctors", "specialist", "claimants", "consultants", "teacher",
  "manpower", "practitioners", "dentists", "claimant", "unions",
  "unemployed", "poor", "pensioners", "tenants"
)

# Restricted version of `employees`: the same list without the
# occupation-specific terms (officers, teachers, doctors, ...).
employees_lim <- c(
  "staff", "workers", "labour", "personnel", "workforce",
  "union", "employees", "claimants", "manpower", "claimant",
  "unions", "unemployed", "poor", "pensioners", "tenants"
)

consumers <- c(
  "children", "community", "young", "child", "communities",
  "family", "parents", "customers", "families", "patients",
  "consumer", "victims", "consumers", "users", "disabled",
  "customer", "stakeholders", "patient", "children's", "students",
  "youth", "pupils", "passenger", "parent", "carers",
  "student", "victim"
)
  
# Column positions in all_dfmat of the features that belong to each group.
# Terms that do not occur in the matrix are simply not matched.
business_ind <- which(colnames(all_dfmat) %in% business)
employees_ind <- which(colnames(all_dfmat) %in% employees)
consumers_ind <- which(colnames(all_dfmat) %in% consumers)
employees_lim_ind <- which(colnames(all_dfmat) %in% employees_lim)
other_ind <- which(colnames(all_dfmat) %in% other)

# Per-document number of mentions of each group, stored as docvars.
# rowSums() works on the sparse matrix directly, whereas apply() converts it
# to a dense matrix first. `other` now uses its index vector like the other
# groups; indexing by the raw term list fails if any term is missing from the
# matrix.
docvars(all_dfmat)$business <- rowSums(all_dfmat[, business_ind])
docvars(all_dfmat)$employees <- rowSums(all_dfmat[, employees_ind])
docvars(all_dfmat)$consumers <- rowSums(all_dfmat[, consumers_ind])
docvars(all_dfmat)$employees_lim <- rowSums(all_dfmat[, employees_lim_ind])
docvars(all_dfmat)$other <- rowSums(all_dfmat[, other_ind])


# Mean number of restricted-employee mentions per document, by year.
plot(
  aggregate(
    docvars(all_dfmat)$employees_lim,
    by = list(as.numeric(docvars(all_dfmat)$docvar5)),
    FUN = mean
  ),
  type = "l"
)

# Append the group counts to the covariate data used by the models below.
metadata_addl <- data.frame(
  other = docvars(all_dfmat)$other,
  business = docvars(all_dfmat)$business,
  employees = docvars(all_dfmat)$employees,
  consumers = docvars(all_dfmat)$consumers,
  employees_lim = docvars(all_dfmat)$employees_lim
)

metadata <- data.frame(metadata, metadata_addl)


# Run the model again with the group-mention counts as additional
# prevalence controls:

all_stm_groups <- stm(
  documents = all_dfmat,
  K = 20,
  prevalence = ~ s(year, 10) + gov + s(business, 3) + s(employees, 3) +
    s(consumers, 3),
  data = metadata,
  seed = 123
)

# Effects of log(1 + mentions) of each group on topic proportions:
# - estd_groups uses the restricted employee list,
# - estd_groups2 uses the full employee list,
# - estd_groups3 adds a linear year term to estd_groups2.
estd_groups <- estimateEffect(
  ~ log(business + 1) + log(consumers + 1) + log(employees_lim + 1),
  all_stm_groups,
  metadata = metadata,
  uncertainty = "Global",
  nsims = 25
)
estd_groups2 <- estimateEffect(
  ~ log(business + 1) + log(consumers + 1) + log(employees + 1),
  all_stm_groups,
  metadata = metadata,
  uncertainty = "Global",
  nsims = 25
)
estd_groups3 <- estimateEffect(
  ~ year + log(business + 1) + log(consumers + 1) + log(employees + 1),
  all_stm_groups,
  metadata = metadata,
  uncertainty = "Global",
  nsims = 25
)


# Regression tables for the time-only and the group specifications.
summary(estd)
summary(estd_groups)
summary(estd_groups2)
summary(estd_groups3)
# Plots for each group.
# Effect of restricted-employee mentions on each topic in ec (2 x 5 grid).
par(mfrow = c(2, 5))
for (i in ec) {
  plot.estimateEffect(
    estd_groups,
    covariate = "employees_lim", method = "continuous", topics = i,
    printlegend = FALSE, main = labels_all_stm[i]
  )
}

# Effect of year on each topic in ec, holding the group mentions fixed.
# This uses estd_groups3 because it is the only group specification whose
# formula contains `year`; estd_groups has no year term, so it cannot be
# plotted against that covariate.
par(mfrow = c(2, 5))
for (i in ec) {
  plot.estimateEffect(
    estd_groups3,
    covariate = "year", method = "continuous", topics = i,
    printlegend = FALSE, main = labels_all_stm[i]
  )
}

# Change in topic proportions when consumer mentions go from 0 to 100.
plot.estimateEffect(
  estd_groups,
  covariate = "consumers", topics = ec, method = "difference",
  cov.value1 = 100, cov.value2 = 0,
  labeltype = "custom", custom.labels = labels_all_stm[ec]
)

# Log mentions of the restricted employee terms by year, with a lowess
# smoother (span 1/3) in red.
plot(metadata$year, log(metadata$employees_lim + 1))
lines(
  lowess(metadata$year, log(metadata$employees_lim + 1), f = 1 / 3),
  col = "red"
)

# Linear time trends in log mentions of each group; I(year / 10) makes the
# slope a per-decade change.
summary(lm(log(employees + 1) ~ I(year / 10), data = metadata))
summary(lm(log(employees_lim + 1) ~ I(year / 10), data = metadata))
summary(lm(log(business + 1) ~ I(year / 10), data = metadata))
summary(lm(log(consumers + 1) ~ I(year / 10), data = metadata))



# Smoothed trends for all groups in one figure. The points are drawn in
# white so that only the axes are set up; the lowess curves carry the content.
plot(
  metadata$year, log(metadata$employees_lim + 1),
  col = "white", ylim = c(1.7, 3.6), xlab = "Year", ylab = "Ln(mentions)"
)
r1 <- lowess(log(metadata$employees_lim + 1) ~ metadata$year)
lines(r1, col = "orange")
r2 <- lowess(log(metadata$business + 1) ~ metadata$year)
lines(r2, col = "blue")
r3 <- lowess(log(metadata$consumers + 1) ~ metadata$year)
lines(r3, col = "green")
r4 <- lowess(log(metadata$employees + 1) ~ metadata$year)
lines(r4, col = "red")

# Manual label placement needs mouse clicks on a screen device, so it is only
# attempted in an interactive session. Labels are placed in click order.
if (interactive()) {
  text(
    locator(),
    labels = c("Consumers", "Business", "Employees", "Employees (lim)")
  )
}



# Legend colours follow the lines drawn above (the restricted employee series
# is orange, not yellow).
legend(
  1981, 3.5,
  legend = c("Employees (lim)", "Employees", "Consumers", "Business"),
  col = c("orange", "red", "green", "blue"), lty = 1, cex = 1
)


# Quadratic time trend for the "other" group. The fitted curve is evaluated on
# the sorted distinct years: lines() joins points in the order given, so
# plotting the per-document fitted values in document order zigzags.
r5 <- lm(log(other + 1) ~ year + I(year^2), data = metadata)
year_grid <- data.frame(year = sort(unique(metadata$year)))
lines(year_grid$year, predict(r5, newdata = year_grid), col = "black")


# Appendix 2.8  - Validation exercise
# Draw docs for validation: a fixed-seed random sample of 110 documents.

set.seed(1234)
drw <- sample(seq_len(nrow(all_dfmat)), 110)

# docvar6 of the sampled documents (presumably the file name - confirm).
docvars(all_dfmat)$docvar6[drw]

# Document-topic proportions of all_stm, one row per document.
all_stm_dtprop <- make.dt(all_stm)

sort(all_stm_dtprop[drw[110], ])

# Topic proportions of every sampled document, largest first.
for (i in seq_along(drw)) {
  print(i)
  print(sort(all_stm_dtprop[drw[i], ], decreasing = TRUE))
}

# Write the first 5000 characters of each sampled document to a text file.
# NOTE(review): all_cmd_notr is only created in the commented-out code at the
# top of the script, so it must already exist in the workspace, and its rows
# are assumed to line up with the rows of all_dfmat - confirm.
# Whitespace is squished with stringi (already loaded): stringr, which
# provides str_squish(), is never attached in this script.
sink("valid_texts.txt")
tryCatch(
  {
    for (i in seq_along(drw)) {
      print(i)
      print(docvars(all_dfmat)$docvar6[drw[i]])
      doc_head <- substr(all_cmd_notr$text[drw[i]], 1, 5000)
      print(stri_trim_both(stri_replace_all_regex(doc_head, "\\s+", " ")))
      cat("\n")
    }
  },
  # Always restore console output, even if a document fails to print.
  finally = sink()
)



# Cabinet papers replication

require(quanteda)
require(quanteda.corpora)
require(quanteda.textmodels)
library(readtext)
require(stm)

# 
# data_cab <- readtext("data text.csv",text_field = "text")
# data_cab <- data_cab[-which(data_cab$text==""),]
# data_cab <- corpus(data_cab)
# 
# toks_cab <- tokens(data_cab,
#                remove_numbers=TRUE,
#                remove_punct=TRUE,
#                remove_symbols=TRUE,
#                remove_separators=TRUE,
#                remove_url = TRUE,
#                verbose=TRUE)
# 
# toks_cab <- tokens_tolower(toks_cab)
# 
# toks_cab <- tokens_remove(toks_cab, pattern = stopwords('en'), min_nchar=3)
# 
# 
# dfmat <- dfm(toks_cab)
# 
# dfmat <- dfm_trim(dfmat, min_docfreq = 0.01, max_docfreq = 0.75, docfreq_type = "prop", verbose=TRUE)
# 
# notadopted <- which(docvars(dfmat)$adopted==0)
# 
# dfmat <- dfmat[-notadopted,]
# 
# dfmat <- dfmat[ntoken(dfmat) > 0,]

# Main results:
# 20-topic model on the cabinet papers matrix `dfmat` (built in the
# commented-out code above, so it must already exist in the workspace).
# NOTE(review): prevalence uses `date` while the effects below use `datenum`,
# and no `data` argument is passed - confirm where `date` is resolved from.
stm2c20 <- stm(
  documents = dfmat,
  K = 20,
  prevalence = ~ s(date, 10),
  seed = 123
)

summary(stm2c20)

# Identify the economic topics
# ect <- c(3, 8, 10, 12, 13, 14, 15, 16, 17, 18)


# Vector of names

# topicnames3 <- c("Unemployment", "SOEs, privatization", "Economic prospects", "Public sector pay", "Health", 
#                  "Public spending", "Strikes", "Public sector reform", "Economic statistics", "Social insurance" 
# )


# Estimate marginal effects: topic proportions on a spline in datenum,
# with covariates taken from the docvars of dfmat.
estd_cab <- estimateEffect(
  ~ s(datenum, 10),
  stm2c20,
  metadata = docvars(dfmat)
)

# Effects for table 5:
# linear trend in datenum / 365 / 100 (presumably days rescaled to
# centuries - confirm the unit of datenum).
estd_cab_1 <- estimateEffect(
  ~ I((datenum / 365) / 100),
  stm2c20,
  metadata = docvars(dfmat)
)

# # Continuous plots - Figure 4

# par(mfrow=c(2, 5))
# j=1
# for (i in ect){
#   plot.estimateEffect(estd_cab, covariate="datenum", method="continuous", topics=i, printlegend=F, main=topicnames3[j], xaxt="n")
#   axis(1, at=c(4748, 10196 ), labels=c("1983", "1997"))
#   j=j+1
# }



