Research Article

Distance Measurement Methods for Improved Insider Threat Detection

Algorithm 6

HMM full evaluation code.
library(HMM)
library(readr)
library(stringdist)
# ---- Collect answer files and parse their metadata ----

# Accumulator for the per-user HMM outcome; filled by the evaluation loop below.
hmmResults <- c()

# list.files() interprets `pattern` as a regular expression (not a shell glob),
# so anchor on a literal ".csv" extension.
files <- list.files(
  path = "~/answers",
  pattern = "\\.csv$",
  full.names = TRUE,
  recursive = TRUE
)

# Extract the username and scenario number encoded in an answer-file path.
#
# The username is the first run of alphanumerics that sits between a "-" and
# a "."; the scenario is the first single digit 1-3 that sits between two "-".
# NOTE(review): this presumably targets names such as
# "<release>-<scenario>-<USER>.csv" -- confirm against the answer directory.
#
# Args:
#   path: a single file path (character scalar).
# Returns:
#   list(username = <chr>, scenario = <chr>), or NULL when either token is
#   missing from `path`.
parseAnswerFile <- function(path) {
  userToken <- regmatches(path, regexpr("-[A-Za-z0-9]+\\.", path))
  scenarioToken <- regmatches(path, regexpr("-[1-3]-", path))
  if (length(userToken) == 0L || length(scenarioToken) == 0L) {
    return(NULL)
  }
  list(
    # Strip the leading "-" and the trailing "." that delimit the match.
    username = gsub("^-|\\.$", "", userToken),
    scenario = gsub("-", "", scenarioToken, fixed = TRUE)
  )
}

parsedFiles <- lapply(files, parseAnswerFile)
isParsed <- !vapply(parsedFiles, is.null, logical(1))

# Drop unparseable files from all three vectors together so that
# usernames[i], scenarios[i] and filenames[i] always describe the same file.
if (!all(isParsed)) {
  warning(
    "Skipping answer files with unrecognised names: ",
    paste(files[!isParsed], collapse = ", "),
    call. = FALSE
  )
}

usernames <- vapply(
  parsedFiles[isParsed],
  function(p) p$username,
  character(1),
  USE.NAMES = FALSE
)
scenarios <- vapply(
  parsedFiles[isParsed],
  function(p) p$scenario,
  character(1),
  USE.NAMES = FALSE
)
filenames <- files[isParsed]
# ---- Per-user HMM evaluation ----
# For every answer file: train an HMM on the user's first five weeks of
# activity, score each later week against the (incrementally retrained) model,
# and check whether the lowest-scoring week is one of the weeks named in the
# answer file. Relies on `cert_r4_2_dataset` (columns user, date, week,
# activity) being defined elsewhere.
for (i in seq_along(usernames)) {
  # Only the third column of the answer file is read; it is parsed as a
  # timestamp below.
  answerFile <- read_csv(
    filenames[i],
    col_names = FALSE,
    col_types = cols_only(X3 = col_guess())
  )
  answerFile$X3 <- as.POSIXct(
    answerFile$X3,
    format = "%m/%d/%Y %H:%M:%S",
    tz = "UTC"
  )

  # Filter dataset to only include data relevant to the chosen user.
  user <- cert_r4_2_dataset[cert_r4_2_dataset$user %in% usernames[i], ]

  # Match answer-file dates to user dates.
  m <- match(answerFile$X3, user$date)
  # Week(s) in which the attack ACTUALLY occurred.
  week <- user$week[m]

  # One vector of activities per week.
  allWeeks <- split(user$activity, user$week)

  # Training uses weeks 1-5 and scoring starts at week 6, so fewer than six
  # weeks cannot be evaluated; record NA to keep hmmResults aligned with
  # usernames.
  if (length(allWeeks) < 6L) {
    warning(
      "Fewer than 6 weeks of activity for user ", usernames[i],
      "; skipping HMM evaluation.",
      call. = FALSE
    )
    hmmResults <- c(hmmResults, NA)
    next
  }

  # Convert allWeeks into a data frame: pad every week with NA up to the
  # length of the longest week so the weeks can be bound as columns.
  indx <- lengths(allWeeks)
  res <- as.data.frame(
    do.call(cbind, lapply(allWeeks, `length<-`, max(indx)))
  )

  ############## HMM Phase ########################
  # Initiate a 10-state HMM with 7 labels (which represent activities of user).
  hmm <- initHMM(c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10), c(1, 2, 3, 4, 5, 6, 7))

  # Train our model with the first 5 weeks of user activity.
  model <- baumWelch(
    hmm,
    na.omit(unlist(res[1:5])),
    maxIterations = 20,
    pseudoCount = 0.1,
    delta = 0.01
  )

  # weekScores[k] holds the score of week k + 5.
  weekScores <- numeric(length(res) - 5L)

  # For the remaining weeks of activity...
  # (a dedicated index is used so the outer loop's `i` is not overwritten)
  for (w in 6:length(res)) {
    # ... calculate the probability of week w occurring against the model.
    logForwardProbabilities <- forward(model$hmm, na.omit(unlist(res[w])))

    # Score = sum over states of the last column of the forward matrix
    # (one row per state, one column per observation).
    # NOTE(review): these are log values, so this sums logs rather than
    # log-sum-exp'ing them; kept as-is to reproduce the published method.
    lastStep <- ncol(logForwardProbabilities)
    weekScores[w - 5L] <- sum(logForwardProbabilities[, lastStep])

    # Prints the current week so we can see progress in the console.
    print(w)

    # ... and update the model with week w.
    model <- baumWelch(
      model$hmm,
      na.omit(unlist(res[1:w])),
      maxIterations = 20,
      pseudoCount = 0.1,
      delta = 0.01
    )
  }

  ########## Find Lowest Probability #############
  # which.min() returns the first minimum and ignores NA/NaN scores; it
  # returns integer(0) when no score is usable.
  lowest <- which.min(weekScores)

  if (length(lowest) == 0L) {
    hmmResults <- c(hmmResults, NA)
    next
  }

  # Offset is +5 since our results start at week 6.
  # NOTE(review): this treats the column position in `res` as the week
  # number, i.e. assumes weeks are numbered 1..N without gaps -- confirm.
  hmmWeek <- lowest + 5L

  if (hmmWeek %in% week) {
    hmmResults <- c(hmmResults, hmmWeek)
  } else {
    hmmResults <- c(hmmResults, "FALSE")
  }
}

fullResults <- data.frame(usernames, scenarios, filenames, hmmResults)