# Twitter Data - Code

### Packages ###
# install.packages("xlsx")  # run once interactively, not on every script run

# Run Packages
library(xlsx)  # NOTE(review): xlsx is loaded but never used in this script —
               # confirm it is needed downstream, otherwise drop this line

# Tweet 1 -- Rizoiu et al. replication ----

# First 600 seconds of retweet activity (43 cases).
tweet <- c(0, 21, 31, 33, 49, 54, 58, 62, 82, 87, 87.5, 89, 100, 100.5, 101,
           102, 104, 113, 117, 128, 134, 148, 149, 177, 202, 202.5, 236, 252,
           283, 288, 292, 296, 300, 346, 351, 352, 419, 432, 443, 449, 452,
           488, 590)
n <- length(tweet)
y <- seq_len(n)  # cumulative retweet count at each arrival time

plot(tweet, y, type = "l",
     main = "Retweet Instances for first 600 seconds",
     ylab = "Retweets", xlab = "Time (seconds)")

# Fit Hawkes-process parameters (alpha, beta, lambda) by maximum likelihood.
# `loglik` is defined elsewhere in the project; `arrivals` is its data argument.
MLE <- nlm(loglik, c(1, 1, 1), hessian = TRUE, arrivals = tweet)
MLE
paste(c("alpha", "beta", "lambda"), round(MLE$estimate, 2), sep = " = ")
# usual answer: "alpha = 0.03" "beta = 0.07" "lambda = 0.04"
# Rizoiu's estimates: {alpha = 0.0003, beta = 1.0156, lambda = 0.0054}
# (Rizoiu et al. fitted this as a *marked* Hawkes process, hence the
# different results.)

alphaest  <- MLE$estimate[1]
betaest   <- MLE$estimate[2]
lambdaest <- MLE$estimate[3]

# Branching Ratio: expected number of offspring events triggered per event.
Branch <- alphaest / betaest
Branch

# Simulate a cascade with the fitted parameters and overlay it on the data.
# `simulate_hawkes` is defined elsewhere in the project.
tweetrep <- simulate_hawkes(alphaest, betaest, lambdaest, n)
y <- seq_len(n)
plot(tweet, y, type = "l",
     main = "Actual vs Simulated Retweet Cascade (first 600 seconds)",
     ylab = "Retweets", xlab = "Time (seconds)")
lines(tweetrep, y, col = "blue")
legend("topleft",
       legend = c("Actual Retweet Cascade", "Simulated Retweet Cascade"),
       text.col = c("black", "blue"), pch = c(16, 15),
       col = c("black", "blue"))
# I simulated the retweet cascade using the parameters the function estimated
# and plotted them together. With only 43 instances it is hard for the
# algorithm to get a good replication, so my replication was not brilliantly
# successful for the case in the paper with the first 600 seconds.

# Tweet 2 ----
# I will replicate this process using more than 600 seconds
# (almost using the full data set of retweets).
tweettime <- read.delim("tweettime.txt", header = FALSE)
tweet <- tweettime$V1
tweet  # 219 values

# Remove values over 3500 seconds.
# BUG FIX: the original built `remove <- c(3500:250000)` and dropped values
# with `!tweet %in% remove`, which only matches *integer* arrival times —
# any non-integer time past 3500 s would silently survive the filter.
# A direct comparison is robust to non-integer times.
tweet <- tweet[tweet < 3500]
tweet  # reduced to 160 values
n <- length(tweet)
n
y <- seq_len(n)
# Plot the full-data retweet cascade (uses `tweet`, `y`, `n` from above).
plot(tweet, y, type = "l",
     main = "Retweet Instances (full data set)",
     ylab = "Retweets", xlab = "Time (seconds)")

# Rescale time to make it easier for the nlm function to find
# reproducible values.
tweet <- tweet / 100

# Fit alpha, beta, lambda by maximum likelihood on the rescaled times.
MLE <- nlm(loglik, c(2, 2, 2), hessian = TRUE, arrivals = tweet)
MLE
paste(c("alpha", "beta", "lambda"), round(MLE$estimate, 2), sep = " = ")

alphaest  <- MLE$estimate[1]
betaest   <- MLE$estimate[2]
lambdaest <- MLE$estimate[3]
# lambdaest = 0.01

# Branching Ratio
Branch <- alphaest / betaest
Branch

# Multiple simulations (`multsim` is defined elsewhere in the project).
tweetrep <- multsim(alphaest, betaest, lambdaest, n)
y <- seq_len(n)

# Undo the /100 rescaling so both series plot on the original time axis.
tweet1   <- tweet * 100
tweetrep <- tweetrep * 100
plot(tweet1, y, type = "l",
     main = "Actual vs Simulated Retweet Cascade (full data set)",
     ylab = "Retweets", xlab = "Time (seconds)")
lines(tweetrep, y, col = "blue")
legend("topleft",
       legend = c("Actual Retweet Cascade", "Simulated Retweet Cascade"),
       text.col = c("black", "blue"), pch = c(16, 15),
       col = c("black", "blue"))
# For the full data set this has not worked at all — the model cannot account
# for the tweet fading into the abyss of the internet. Before rescaling, the
# MLE just gave back the same results as the 600-second fit, but now it gives
# very high exponential parameters and a very low lambda to make up for the
# lack of arrivals over most of the process; this is probably why Rizoiu et al.
# only used the first 600 seconds. I then divided the arrival times by 100 to
# make it easier for the nlm function to find replicable values; this result
# was a bit better and gave an okay graph.