# Twitter Data - Code

### Packages ###
# install.packages("xlsx")  # run once interactively, not on every script run

# Run Packages
library(xlsx)  # NOTE(review): xlsx is loaded but never used in this script —
               # confirm it is needed downstream, otherwise drop this line

# Tweet 1 -- Rizoiu et al. replication ----

# First 600 seconds of retweet activity (43 cases).
tweet <- c(0, 21, 31, 33, 49, 54, 58, 62, 82, 87, 87.5, 89, 100, 100.5, 101,
           102, 104, 113, 117, 128, 134, 148, 149, 177, 202, 202.5, 236, 252,
           283, 288, 292, 296, 300, 346, 351, 352, 419, 432, 443, 449, 452,
           488, 590)
n <- length(tweet)
y <- seq_len(n)  # cumulative retweet count at each arrival time

plot(tweet, y, type = "l",
     main = "Retweet Instances for first 600 seconds",
     ylab = "Retweets", xlab = "Time (seconds)")

# Fit Hawkes-process parameters (alpha, beta, lambda) by maximum likelihood.
# `loglik` is defined elsewhere in the project; `arrivals` is its data argument.
MLE <- nlm(loglik, c(1, 1, 1), hessian = TRUE, arrivals = tweet)
MLE
paste(c("alpha", "beta", "lambda"), round(MLE$estimate, 2), sep = " = ")
# usual answer: "alpha = 0.03" "beta = 0.07" "lambda = 0.04"
# Rizoiu's estimates: {alpha = 0.0003, beta = 1.0156, lambda = 0.0054}
# (Rizoiu et al. fitted this as a *marked* Hawkes process, hence the
# different results.)

alphaest  <- MLE$estimate[1]
betaest   <- MLE$estimate[2]
lambdaest <- MLE$estimate[3]

# Branching Ratio: expected number of offspring events triggered per event.
Branch <- alphaest / betaest
Branch

# Simulate a cascade with the fitted parameters and overlay it on the data.
# `simulate_hawkes` is defined elsewhere in the project.
tweetrep <- simulate_hawkes(alphaest, betaest, lambdaest, n)
y <- seq_len(n)
plot(tweet, y, type = "l",
     main = "Actual vs Simulated Retweet Cascade (first 600 seconds)",
     ylab = "Retweets", xlab = "Time (seconds)")
lines(tweetrep, y, col = "blue")
legend("topleft",
       legend = c("Actual Retweet Cascade", "Simulated Retweet Cascade"),
       text.col = c("black", "blue"), pch = c(16, 15),
       col = c("black", "blue"))
# I simulated the retweet cascade using the parameters the function estimated
# and plotted them together. With only 43 instances it is hard for the
# algorithm to get a good replication, so my replication was not brilliantly
# successful for the case in the paper with the first 600 seconds.

# Tweet 2 ----
# I will replicate this process using more than 600 seconds
# (almost using the full data set of retweets).
tweettime <- read.delim("tweettime.txt", header = FALSE)
tweet <- tweettime$V1
tweet  # 219 values

# Remove values over 3500 seconds.
# BUG FIX: the original built `remove <- c(3500:250000)` and dropped values
# with `!tweet %in% remove`, which only matches *integer* arrival times —
# any non-integer time past 3500 s would silently survive the filter.
# A direct comparison is robust to non-integer times.
tweet <- tweet[tweet < 3500]
tweet  # reduced to 160 values
n <- length(tweet)
n
y <- seq_len(n)
# Plot the full-data retweet cascade (uses `tweet`, `y`, `n` from above).
plot(tweet, y, type = "l",
     main = "Retweet Instances (full data set)",
     ylab = "Retweets", xlab = "Time (seconds)")

# Rescale time to make it easier for the nlm function to find
# reproducible values.
tweet <- tweet / 100

# Fit alpha, beta, lambda by maximum likelihood on the rescaled times.
MLE <- nlm(loglik, c(2, 2, 2), hessian = TRUE, arrivals = tweet)
MLE
paste(c("alpha", "beta", "lambda"), round(MLE$estimate, 2), sep = " = ")

alphaest  <- MLE$estimate[1]
betaest   <- MLE$estimate[2]
lambdaest <- MLE$estimate[3]
# lambdaest = 0.01

# Branching Ratio
Branch <- alphaest / betaest
Branch

# Multiple simulations (`multsim` is defined elsewhere in the project).
tweetrep <- multsim(alphaest, betaest, lambdaest, n)
y <- seq_len(n)

# Undo the /100 rescaling so both series plot on the original time axis.
tweet1   <- tweet * 100
tweetrep <- tweetrep * 100
plot(tweet1, y, type = "l",
     main = "Actual vs Simulated Retweet Cascade (full data set)",
     ylab = "Retweets", xlab = "Time (seconds)")
lines(tweetrep, y, col = "blue")
legend("topleft",
       legend = c("Actual Retweet Cascade", "Simulated Retweet Cascade"),
       text.col = c("black", "blue"), pch = c(16, 15),
       col = c("black", "blue"))
# For the full data set this has not worked at all — the model cannot account
# for the tweet fading into the abyss of the internet. Before rescaling, the
# MLE just gave back the same results as the 600-second fit, but now it gives
# very high exponential parameters and a very low lambda to make up for the
# lack of arrivals over most of the process; this is probably why Rizoiu et al.
# only used the first 600 seconds. I then divided the arrival times by 100 to
# make it easier for the nlm function to find replicable values; this result
# was a bit better and gave an okay graph.