Projects I have been working on
DetGen - Generating controllable traffic micro-structures for model probing
Model validation through careful model probing is of particular importance in cyber-security. DetGen is a tool we built to produce traffic with controllable characteristics, such as the specific activity conducted, the amount of data transmitted, or the congestion experienced, in order to attach ground-truth labels describing the traffic's computational origin. We use this tool to examine where machine-learning-based intrusion detection models fail to classify traffic correctly, and which traffic micro-structures cause these failures.
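Because every generated flow carries a label for the activity, data size, and congestion level that produced it, probing essentially reduces to grouping a classifier's errors by those labels. The sketch below illustrates that idea only; the DataFrame columns and the classifier outputs are hypothetical placeholders, not the actual DetGen interface.

import pandas as pd

# Hypothetical probing sketch: 'flows' holds DetGen-style ground-truth labels
# (activity, congestion level) alongside a classifier's predictions.
def error_rates_by_microstructure(flows):
    """Group misclassifications by the controlled traffic characteristics."""
    flows = flows.assign(error=flows['predicted_label'] != flows['true_label'])
    return (flows
            .groupby(['activity', 'congestion_level'])['error']
            .mean()
            .reset_index(name='error_rate')
            .sort_values('error_rate', ascending=False))

# Usage (assumed column names):
# probe_report = error_rates_by_microstructure(flows)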


Some of the use-cases I examined, and am currently publishing, include the inability of an LSTM traffic classifier to cope with excessive retransmission sequences, and the coherence of encoder-based traffic projection methods.


DetGen is available for researchers to use on GitHub.
Bidirectional LSTMs for access attack detection
In this project, we want to explore how far common sequential network flow structures can be leveraged to create a model that detects low-volume access attacks. The underlying idea is that flow sequences corresponding to individual actions such as web browsing follow very strong and repetitive structures; for example, small HTTP flows are normally followed by larger HTTP flows.


For this, we built a bidirectional LSTM network with ≈10,000 parameters that learns a language model of flow sequences, with flows acting as word tokens according to their destination port, direction, and size. Our reasoning is that these structures can help detect low-volume attacks such as Heartbleed or SQL injections, which deviate from these structures by exploiting vulnerabilities.
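As an illustration of the tokenisation described above, each flow can be mapped to a discrete token built from its destination port, direction, and a bucketed size. The port list and size thresholds below are illustrative placeholders, not the exact alphabet used in the model.

# Illustrative flow tokenisation: port, direction and bucketed size form the "word".
# The port whitelist and size buckets are placeholder choices, not the real alphabet.
COMMON_PORTS = {80: 'HTTP', 443: 'HTTPS', 53: 'DNS', 22: 'SSH'}

def flow_token(dst_port, outbound, n_bytes):
    service = COMMON_PORTS.get(dst_port, 'OTHER')
    direction = 'OUT' if outbound else 'IN'
    if n_bytes < 1_000:
        size = 'S'
    elif n_bytes < 100_000:
        size = 'M'
    else:
        size = 'L'
    return '{}-{}-{}'.format(service, direction, size)

# e.g. a small outbound HTTP request followed by a large inbound HTTP response:
# flow_token(80, True, 400)      -> 'HTTP-OUT-S'
# flow_token(80, False, 250000)  -> 'HTTP-IN-L'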

You can find a corresponding implementation here:
# RNN definition
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.autograd import Variable
from torch.utils import data

# session_loader, AverageMeter, save_checkpoint and PATH_MODEL are defined elsewhere in the project.


class LSTM_deep(nn.Module):
    def __init__(self, n_categories, seq_input_size, seq_size_input_size, batch_size, hidden_size):
        super(LSTM_deep, self).__init__()
        dropout = 0.2
        self.num_layers = num_layers = 2
        self.bidirectional = bidirectional = False
        embed_dim1 = 10
        embed_dim2 = 5
        embed_dim3 = 3
        self.hidden_size = hidden_size
        self.hidden_size2 = hidden_size2 = hidden_size
        self.cn_size = hidden_size
        self.seq_input_size = seq_input_size
        self.n_categories = n_categories
        # embedding layers for the flow token, size token and host category
        # (using nn.Embedding would be the more standard choice)
        self.embed_state = nn.Linear(seq_input_size, embed_dim1)
        self.embed_size = nn.Linear(seq_size_input_size, embed_dim2)
        self.embed_cat = nn.Linear(n_categories, embed_dim3)
        self.i2h = nn.LSTM(input_size=embed_dim1 + embed_dim2 + embed_dim3,
                           hidden_size=hidden_size,
                           num_layers=num_layers, dropout=dropout, bidirectional=bidirectional)
        self.o2o = nn.Sequential(nn.Linear(hidden_size, hidden_size2), nn.ReLU(inplace=True))
        self.o2o2 = nn.Linear(hidden_size2, seq_input_size)
        self.size_o2o2 = nn.Linear(hidden_size2, seq_size_input_size)
        self.dropout = nn.Dropout(dropout)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, category, seq_input, seq_size_input, hidden, cn, new_batch_size):
        seq_input_combined_LSTM = torch.cat((self.embed_cat(category.view(new_batch_size, 1, self.n_categories)),
                                             self.embed_state(seq_input),
                                             self.embed_size(seq_size_input)), 2).transpose(0, 1)
        outputLSTM, (hidden, cn) = self.i2h(seq_input_combined_LSTM, (hidden, cn))
        outputLSTM = outputLSTM.view(new_batch_size, self.hidden_size)
        outputLSTM = self.dropout(outputLSTM)
        output = self.o2o(outputLSTM)
        state_output = self.softmax(self.o2o2(output))
        size_output = self.softmax(self.size_o2o2(output))
        return state_output, size_output, hidden, cn

    def initHidden(self, new_batch_size):
        return Variable(torch.zeros(self.num_layers, new_batch_size, self.hidden_size))

    def initcn(self, new_batch_size):
        return Variable(torch.zeros(self.num_layers, new_batch_size, self.cn_size))
def train_epoch(train_loader, model, criterion, optimizer, epoch, normal):
    """
    Train for one epoch
    """
    model.train()
    for i, (category, seq_input, seq_size_input, lengths, target, size_target, SrcAddr, session_id,
            new_batch_size) in enumerate(train_loader):
        # print progress with decreasing frequency
        if i < 10 or (i < 100 and i % 10 == 0) or (i < 1000 and i % 100 == 0) or i % 1000 == 0:
            print('i={}'.format(i))
        # wrap the tensors in Variables to use gradients
        category = Variable(category)
        seq_input = Variable(seq_input.transpose(0, 1))
        seq_size_input = Variable(seq_size_input.transpose(0, 1))
        target = Variable(target.transpose(0, 1))
        size_target = Variable(size_target.transpose(0, 1))
        # initialise the LSTM hidden and cell states
        hidden = model.initHidden(new_batch_size)
        cn = model.initcn(new_batch_size)
        model.zero_grad()
        loss = 0
        for flow in range(seq_input.size()[0]):
            state_output, size_output, hidden, cn = model(category, seq_input[flow], seq_size_input[flow],
                                                          hidden, cn, new_batch_size)
            loss += criterion(state_output, target[flow]) + criterion(size_output, size_target[flow])
        loss.backward()
        optimizer.step()
def validate_epoch(val_loader, model, criterion, epoch, normal):
    losses = AverageMeter()
    # switch to evaluation mode so that the Dropout doesn't drop neurons
    model.eval()
    for i, (category, seq_input, seq_size_input, lengths, target, size_target, SrcAddr, session_id,
            new_batch_size) in enumerate(val_loader):
        if i < 10 or (i < 100 and i % 10 == 0) or (i < 1000 and i % 100 == 0) or i % 1000 == 0:
            print('i={}'.format(i))
        category = Variable(category)
        seq_input = Variable(seq_input.transpose(0, 1))
        seq_size_input = Variable(seq_size_input.transpose(0, 1))
        target = Variable(target.transpose(0, 1))
        size_target = Variable(size_target.transpose(0, 1))
        hidden = model.initHidden(new_batch_size)
        cn = model.initcn(new_batch_size)
        loss = 0
        for flow in range(seq_input.size()[0]):
            state_output, size_output, hidden, cn = model(category, seq_input[flow], seq_size_input[flow],
                                                          hidden, cn, new_batch_size)
            loss += criterion(state_output, target[flow]) + criterion(size_output, size_target[flow])
        print(loss.item())
        losses.update(loss.item(), seq_input.size()[0])
        with open('validate_log.txt', 'a') as f:
            f.write('Epoch: [{0}][{1}/{2}]\t'
                    'loss: {loss}\n'.format(epoch, i, len(val_loader), loss=loss.item()))
    return losses.avg
def predict(df, df_seq_input, categories, alphabet, size_alphabet, dataname, n_hidden=200):
    """
    Loads the trained RNN and returns prediction scores for normal and infected hosts in the test set.
    It also returns the cumulative averages of the scores.
    """
    print('prediction starts')
    n_categories = len(categories)
    batch_size = 1
    global alphabett
    alphabett = alphabet
    global size_alphabett
    size_alphabett = size_alphabet
    checkpoint = torch.load(PATH_MODEL + dataname + '_normal_model_best_long.pth.tar')
    model = LSTM_deep(n_categories, seq_input_size=len(alphabet), seq_size_input_size=len(size_alphabet),
                      batch_size=batch_size, hidden_size=n_hidden)
    model.load_state_dict(checkpoint['state_dict'])
    pred_loader = data.DataLoader(session_loader(df_seq_input[df_seq_input['scenario'] == 3], alphabet, size_alphabet, categories),
                                  batch_size=batch_size, shuffle=True, collate_fn=PadSequence())
    model.eval()
    output_stats = {'SrcAddr': [], 'session_id': [], 'mean': [], 'median': [],
                    'probs': [], 'size_probs': [], 'sesslen': [], 'probsmax': [], 'Ports': [], 'Portsmax': [],
                    'size_probsmax': [], 'Sizes': [], 'Sizemax': [],
                    'meanmax': [], 'medianmax': []}
    row_indicator = [x for x in range(batch_size)]
    for i, (category, seq_input, seq_size_input, lengths, target, size_target, SrcAddr, session_id,
            new_batch_size) in enumerate(pred_loader):
        if i < 10 or (i < 100 and i % 10 == 0) or (i < 1000 and i % 100 == 0) or i % 1000 == 0:
            print('i={}'.format(i))
        probs = []
        probsmax = []
        Ports = []
        Portsmax = ['-']
        size_probs = []
        size_probsmax = []
        Sizes = []
        Sizesmax = ['-']
        category = Variable(category)
        seq_input = Variable(seq_input.transpose(0, 1))
        seq_size_input = Variable(seq_size_input.transpose(0, 1))
        target = Variable(target.transpose(0, 1))
        size_target = Variable(size_target.transpose(0, 1))
        hidden = model.initHidden(new_batch_size)
        cn = model.initcn(new_batch_size)
        for flow in range(seq_input.size()[0]):
            state_output, size_output, hidden, cn = model(category, seq_input[flow], seq_size_input[flow],
                                                          hidden, cn, new_batch_size)
            # observed port and size tokens of the current flow
            value, index = seq_input[flow].squeeze().max(0)
            size_value, size_index = seq_size_input[flow].squeeze().max(0)
            Port = alphabett[index.item()]
            Size = size_alphabett[size_index.item()]
            # probability the model assigned to the flow that actually followed
            prob_next_flow = np.array(list(state_output.data[row_indicator, target.data.numpy()[flow]]))
            size_prob_next_flow = np.array(list(size_output.data[row_indicator, size_target.data.numpy()[flow]]))
            # most likely next port and size token according to the model
            value, index = state_output.data[row_indicator].squeeze().max(0)
            probmax = np.exp(value.item())
            Portmax = alphabett[index.item()]
            size_value, size_index = size_output.data[row_indicator].squeeze().max(0)
            size_probmax = np.exp(size_value.item())
            Sizemax = size_alphabett[size_index.item()]
            probs.extend(list(1 - np.exp(prob_next_flow)))  # exp because the last layer of the RNN is a LogSoftmax
            probsmax.append(1 - probmax)
            Ports.append(Port)
            Portsmax.append(Portmax)
            size_probs.extend(list(1 - np.exp(size_prob_next_flow)))
            size_probsmax.append(1 - size_probmax)
            Sizes.append(Size)
            Sizesmax.append(Sizemax)
        Ports.append('EOS')
        output_stats['mean'].append(np.mean(np.array(probs)))
        output_stats['median'].append(np.median(np.array(probs)))
        output_stats['SrcAddr'].append(SrcAddr[0])
        output_stats['session_id'].append(session_id[0])
        output_stats['probs'].append(str(probs))
        output_stats['probsmax'].append(str(probsmax))
        output_stats['size_probs'].append(str(size_probs))
        output_stats['size_probsmax'].append(str(size_probsmax))
        output_stats['meanmax'].append(np.mean(np.array(probsmax)))
        output_stats['medianmax'].append(np.median(np.array(probsmax)))
        output_stats['Ports'].append(str(Ports))
        output_stats['Sizes'].append(str(Sizes))
        output_stats['Portsmax'].append(str(Portsmax))
        output_stats['Sizemax'].append(str(Sizesmax))
        output_stats['sesslen'].append(seq_input.size()[0])
    df_output = pd.DataFrame(data=output_stats)
    df_output = df_output.sort_values(['SrcAddr', 'session_id'])
    # for each session, also include previous sessions to compute cumulative average scores
    df_output2 = df_output.assign(sum_means=df_output.groupby('SrcAddr')['mean'].cumsum())
    df_output2 = df_output2.assign(count_previous=df_output2.groupby('SrcAddr')['mean'].cumcount() + 1)
    df_output2 = df_output2.assign(sum_medians=df_output2.groupby('SrcAddr')['median'].cumsum())
    df_output2 = df_output2.assign(avg_previous_means=lambda x: x['sum_means'] / x['count_previous'])
    df_output2 = df_output2.assign(avg_previous_medians=lambda x: x['sum_medians'] / x['count_previous'])
    df_output2.to_csv(dataname + '_output_test.csv', index=False)
    return df_output2
def train(df_seq_input, categories, alphabet, size_alphabet, dataname, epochs=250, base_lr=0.0003, n_hidden=200, batch_size=25):
    # training hyperparameters; these will probably need to be tuned
    global alphabett
    alphabett = alphabet
    global size_alphabett
    size_alphabett = size_alphabet
    print('Starting main')
    losses = []
    lr = base_lr
    weight_decay = 5e-4
    momentum = 0.9
    lr_freq_adj = 50
    normal = True
    n_categories = len(categories)
    best_loss = 1000000
    # define the model
    model = LSTM_deep(n_categories, seq_input_size=len(alphabet), seq_size_input_size=len(size_alphabet),
                      batch_size=batch_size, hidden_size=n_hidden)
    # loss function: negative log-likelihood, since this is a multinomial classification
    criterion = nn.NLLLoss()
    train_loader = data.DataLoader(session_loader(df_seq_input[df_seq_input['scenario'] == 1], alphabet, size_alphabet, categories),
                                   batch_size=batch_size, shuffle=True, collate_fn=PadSequence())
    # test loader
    test_loader = data.DataLoader(session_loader(df_seq_input[df_seq_input['scenario'] == 2], alphabet, size_alphabet, categories),
                                  batch_size=batch_size, shuffle=True, collate_fn=PadSequence())
    # optimizer: stochastic gradient descent
    optimizer = torch.optim.SGD(model.parameters(), lr=lr, weight_decay=weight_decay, momentum=momentum)
    for epoch in range(0, epochs):
        print('epoch={}'.format(epoch))
        # adjust learning rate: halve it every lr_freq_adj epochs
        lr = base_lr * (0.5 ** (epoch // lr_freq_adj))
        for param_group in optimizer.param_groups:
            param_group['lr'] = lr
        # train for one epoch
        train_epoch(train_loader, model, criterion, optimizer, epoch, normal)
        print("train epoch ended, calculate validation loss")
        val_loss = validate_epoch(test_loader, model, criterion, epoch, normal)
        print("validation loss calculated, choose if best")
        print("Best loss:" + str(best_loss) + ", val loss:" + str(val_loss))
        is_best = val_loss < best_loss
        best_loss = min(val_loss, best_loss)
        losses.append(val_loss)
        print("Best chosen, save checkpoint")
        save_checkpoint({'epoch': epoch + 1, 'state_dict': model.state_dict(), 'best_loss': best_loss}, is_best, normal, dataname)
        print("Checkpoint saved")
    print("Losses:" + str(losses))
    dflosses = pd.DataFrame(losses)
    dflosses.to_csv(dataname + '_losses.csv', index=False)


###################################################################################################
###################################################################################################
###################################################################################################
def pad_sequence(sequences, batch_first=False, padding_value=20):
    max_size = sequences[0].size()
    trailing_dims = max_size[1:]
    max_len = max([s.size(0) for s in sequences])
    out_dims = (len(sequences), max_len) + trailing_dims
    out_tensor = Variable(sequences[0]).data.new(*out_dims).fill_(0)
    for i, tensor in enumerate(sequences):
        length = tensor.size(0)
        # use index notation to prevent duplicate references to the tensor
        out_tensor[i, :length, ...] = tensor
        if length < max_len:
            out_tensor[i, length:, 0, padding_value] = 1
    return out_tensor


class PadSequence:
    def __call__(self, batch):
        # each element in "batch" is a tuple
        # (category, sequence, size_sequence, label, size_label, SrcAddr, session_id)
        # sort the batch in descending order of sequence length
        sorted_batch = sorted(batch, key=lambda x: x[2].shape[0], reverse=True)
        new_batch_size = len(batch)
        sequences = [x[1] for x in sorted_batch]
        size_sequences = [x[2] for x in sorted_batch]
        padding_value = len(alphabett) - 1
        size_padding_value = len(size_alphabett) - 1
        sequences_padded = pad_sequence(sequences, batch_first=True, padding_value=padding_value)
        size_sequences_padded = pad_sequence(size_sequences, batch_first=True, padding_value=size_padding_value)
        # also store the length of each sequence; this is later needed to unpad the sequences
        lengths = torch.LongTensor([len(x) for x in sequences])
        # grab the labels of the *sorted* batch
        labelss = [x[3] for x in sorted_batch]
        size_labelss = [x[4] for x in sorted_batch]
        out_dim = max([x.size(0) for x in labelss])
        cats = torch.stack([x[0] for x in sorted_batch]).squeeze(1)
        targets = torch.LongTensor(len(labelss), out_dim) * 0 + padding_value
        size_targets = torch.LongTensor(len(size_labelss), out_dim) * 0 + size_padding_value
        for i in range(len(labelss)):
            outter = labelss[i].size(0)
            targets[i, 0:outter] = labelss[i].view(1, -1)
            size_targets[i, 0:outter] = size_labelss[i].view(1, -1)
        SrcAddrs = [x[5] for x in sorted_batch]
        session_ids = [x[6] for x in sorted_batch]
        return (cats, sequences_padded, size_sequences_padded, lengths, targets, size_targets, SrcAddrs, session_ids, new_batch_size)
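For orientation, the following is a hypothetical smoke test of the model above with made-up alphabet sizes; the dummy zero tensors simply stand in for the one-hot flow, size, and category encodings produced by the data loaders.

# Hypothetical smoke test: 20 flow tokens, 5 size tokens, 2 host categories, hidden size 64.
n_categories, n_tokens, n_size_tokens = 2, 20, 5
model = LSTM_deep(n_categories, seq_input_size=n_tokens, seq_size_input_size=n_size_tokens,
                  batch_size=4, hidden_size=64)
batch = 4
category = torch.zeros(batch, 1, n_categories)       # host category indicator
flow_token = torch.zeros(batch, 1, n_tokens)         # current flow token (one-hot)
size_token = torch.zeros(batch, 1, n_size_tokens)    # current size token (one-hot)
hidden, cn = model.initHidden(batch), model.initcn(batch)
state_out, size_out, hidden, cn = model(category, flow_token, size_token, hidden, cn, batch)
print(state_out.shape, size_out.shape)               # -> (4, 20) and (4, 5) log-probabilities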
CNN for stepping-stone flow correlation
I implemented a deep convolutional neural network, inspired by DeepCorr, to correlate connections in a relayed attack. Results were initially very promising, but unfortunately not robust against evasive chaff perturbations, and I have not found a way to overcome this issue. This led me to perform and publish an evaluation of several stepping-stone detection methods in the presence of chaff, none of which showed sufficient robustness.
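To illustrate what such a chaff perturbation does to the input features, the sketch below injects dummy packets into a flow's inter-arrival-time and size vectors before correlation. It is a simplified illustration of the evasion strategy, not the exact perturbation model used in the evaluation; the 60-byte padding size and exponential timing are arbitrary placeholder choices.

import numpy as np

# Simplified chaff injection: insert n_chaff dummy packets at random positions into a
# flow represented by its inter-arrival times and packet sizes.
def inject_chaff(iats, sizes, n_chaff, rng):
    positions = rng.integers(0, len(iats) + 1, size=n_chaff)
    chaff_iats = rng.exponential(scale=np.mean(iats), size=n_chaff)
    chaff_sizes = np.full(n_chaff, 60.0)
    return np.insert(iats, positions, chaff_iats), np.insert(sizes, positions, chaff_sizes)

# Example: perturb a 300-packet flow with 10% chaff before feeding it to the correlator.
rng = np.random.default_rng(0)
iats, sizes = inject_chaff(rng.exponential(0.1, size=300), rng.uniform(60, 1500, size=300), 30, rng)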

You can find the PyTorch implementation here:
"""
Created on Fri Aug 30 11:11:27 2019

@author: henry
"""
import torch
import torch.nn as nn
import pandas as pd
import numpy as np
from torch import optim
import random

print(torch.cuda.is_available())
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)
class ConvNet(nn.Module):
    def __init__(self, l, k1, w1, k2, w2, poolw1, poolw2, Lin1,
                 Lin2, Lin3, pad1, pad2, dropout):
        super(ConvNet, self).__init__()
        self.k1 = k1
        self.k2 = k2
        self.w1 = w1
        self.w2 = w2
        self.l = l
        self.pad1 = pad1
        self.pad2 = pad2
        self.poolw1 = poolw1
        self.poolw2 = poolw2
        self.Lin1 = Lin1
        self.Lin2 = Lin2
        self.Lin3 = Lin3
        # input dim: batch_size x 1 x 2 x l
        self.convtime1 = nn.Conv2d(1, self.k1, (2, self.w1), stride=(2, 1), padding=(0, self.pad1))
        self.convtime2 = nn.Conv2d(1, self.k1, (2, self.w1), stride=(2, 1), padding=(0, self.pad1))
        self.convsize1 = nn.Conv2d(1, self.k1, (2, self.w1), stride=(2, 1), padding=(0, self.pad1))
        self.convsize2 = nn.Conv2d(1, self.k1, (2, self.w1), stride=(2, 1), padding=(0, self.pad1))
        self.relu = nn.ReLU()
        self.convtime1_bn = nn.BatchNorm2d(self.k1)
        self.convtime2_bn = nn.BatchNorm2d(self.k1)
        self.convsize1_bn = nn.BatchNorm2d(self.k1)
        self.convsize2_bn = nn.BatchNorm2d(self.k1)
        self.dropout = nn.Dropout(dropout)
        # output dim: batch_size x k1 x 1 x (l - w1 + 1 + 2*pad1)
        self.pool1 = nn.MaxPool2d((1, self.poolw1), stride=(1, 1))
        # output dim: batch_size x k1 x 1 x (l - w1 + 1 + 2*pad1 - poolw1 + 1)
        self.convcomb = nn.Conv2d(self.k1, self.k2, (4, self.w2), stride=(4, 1), padding=(0, self.pad2))
        self.convcomb_bn = nn.BatchNorm2d(self.k2)
        self.pool2 = nn.MaxPool2d((1, self.poolw2), stride=(1, 1))
        self.fc1 = nn.Linear(self.k2 * (self.l - self.w1 + 1 + 2 * self.pad1 -
                                        self.poolw1 + 1 - self.w2 + 1 + 2 * self.pad2 -
                                        self.poolw2 + 1), self.Lin1)
        self.fc2 = nn.Linear(self.Lin1, self.Lin2)
        self.fc3 = nn.Linear(self.Lin2, self.Lin3)
        self.fc1_bn = nn.BatchNorm1d(self.Lin1)
        self.fc2_bn = nn.BatchNorm1d(self.Lin2)
        self.fc3_bn = nn.BatchNorm1d(self.Lin3)
        self.fcoutput = nn.Linear(self.Lin3, 1)

    def forward(self, input1, input2, input3, input4):
        # input dim: batch_size x 1 x 2 x l
        batchsize = input1.shape[0]
        timefeatures1 = self.dropout(self.convtime1_bn(self.relu(self.convtime1(input1))))
        timefeatures2 = self.dropout(self.convtime2_bn(self.relu(self.convtime2(input2))))
        sizefeatures1 = self.dropout(self.convsize1_bn(self.relu(self.convsize1(input3))))
        sizefeatures2 = self.dropout(self.convsize2_bn(self.relu(self.convsize2(input4))))
        # output dim: batch_size x k1 x 1 x (l - w1 + 1 + 2*pad1)
        timefeatures1 = self.pool1(timefeatures1)
        timefeatures2 = self.pool1(timefeatures2)
        sizefeatures1 = self.pool1(sizefeatures1)
        sizefeatures2 = self.pool1(sizefeatures2)
        # output dim: batch_size x k1 x 1 x (l - w1 + 1 + 2*pad1 - poolw1 + 1)
        combfeatures = torch.cat([timefeatures1, timefeatures2,
                                  sizefeatures1, sizefeatures2], dim=2)
        # output dim: batch_size x k1 x 4 x (l - w1 + 1 + 2*pad1 - poolw1 + 1)
        combfeatures = self.convcomb_bn(self.relu(self.convcomb(combfeatures)))
        # output dim: batch_size x k2 x 1 x (... - w2 + 1 + 2*pad2)
        poolcombfeatures = self.pool2(self.dropout(combfeatures))
        # output dim: batch_size x k2 x 1 x (... - w2 + 1 + 2*pad2 - poolw2 + 1)
        poolcombfeatures = poolcombfeatures.squeeze(2).view(batchsize, -1)
        Linfeatures1 = self.dropout(self.fc1_bn(self.relu(self.fc1(poolcombfeatures))))
        Linfeatures2 = self.dropout(self.fc2_bn(self.relu(self.fc2(Linfeatures1))))
        Linfeatures3 = self.dropout(self.fc3_bn(self.relu(self.fc3(Linfeatures2))))
        Output = self.fcoutput(Linfeatures3)
        return Output
def trainBatch(input1, input2, input3, input4, labels,
               DeepCorr, DeepCorr_optimizer, criterion, train=True):
    DeepCorr_output = DeepCorr(input1.float(), input2.float(),
                               input3.float(), input4.float())
    loss = criterion(DeepCorr_output.squeeze(), labels)
    if train:
        DeepCorr_optimizer.zero_grad()
        loss.backward()
        DeepCorr_optimizer.step()
    return DeepCorr_output, loss.item()
def trainIters(DeepCorr, inputs, batch_size, n_packets=300, train=True,
               learning_rate=0.01, weight_decay=5e-4, Printer=True):
    loss_total = 0
    all_losses = []
    outputs = []
    n_data = inputs.shape[0]
    if batch_size == 0:
        batch_size = n_data
    n_iters = int(np.floor(n_data / batch_size))
    # each row holds four flattened 2 x n_packets feature matrices followed by the label
    input_indices1 = [x for x in range(0, 2 * n_packets)]
    input_indices2 = [x for x in range(2 * n_packets, 4 * n_packets)]
    input_indices3 = [x for x in range(4 * n_packets, 6 * n_packets)]
    input_indices4 = [x for x in range(6 * n_packets, 8 * n_packets)]
    labels_index = 8 * n_packets
    if train:
        DeepCorr.train()
    else:
        DeepCorr.eval()
    iter_indexes = [x * batch_size for x in range(0, n_iters + 1)]
    if iter_indexes[n_iters] != n_data:
        n_iters += 1
        iter_indexes.append(n_data)
    DeepCorr_optimizer = optim.Adam(DeepCorr.parameters(), lr=learning_rate, weight_decay=weight_decay)
    criterion = nn.BCEWithLogitsLoss()
    printtoken = 10
    for iteration in range(1, n_iters + 1):
        if iteration % printtoken == 0:
            if Printer:
                print('iteration=' + str(iteration))
            if iteration == (printtoken * 10):
                printtoken = printtoken * 10
        if train:
            input_samples = random.sample(range(0, n_data), int(n_iters * batch_size))
            batch_index = input_samples[((iteration - 1) * batch_size):(iteration * batch_size)]
        else:
            batch_index = [x for x in range(iter_indexes[iteration - 1], iter_indexes[iteration])]
            batch_size = len(batch_index)
        input1 = torch.tensor(inputs.iloc[batch_index, input_indices1].values, device=device).view(batch_size, 1, 2, -1)
        input2 = torch.tensor(inputs.iloc[batch_index, input_indices2].values, device=device).view(batch_size, 1, 2, -1)
        input3 = torch.tensor(inputs.iloc[batch_index, input_indices3].values, device=device).view(batch_size, 1, 2, -1)
        input4 = torch.tensor(inputs.iloc[batch_index, input_indices4].values, device=device).view(batch_size, 1, 2, -1)
        labels = torch.tensor(inputs.iloc[batch_index, labels_index].values, dtype=torch.float32, device=device)
        DeepCorr_output, loss = trainBatch(input1, input2, input3, input4, labels,
                                           DeepCorr, DeepCorr_optimizer, criterion, train)
        outputs.extend(DeepCorr_output.flatten().tolist())
        all_losses.append(loss)
        loss_total += loss / n_iters
    return outputs, loss_total, all_losses
def trainEpochs(DeepCorr, inputs, batch_size, name='DeepCorrnet', epochs=250, learning_rate=0.01,
                tr_split=0.7, val_split=1.0, weight_decay=5e-4):
    dataname = name
    lr_freq_adj = 35
    best_loss = 1000000000
    train_losses = []
    val_losses = []
    all_train_losses = []
    all_val_losses = []
    # get training and validation inputs
    n_data = inputs.shape[0]
    input_samples = random.sample(range(0, n_data), int(val_split * n_data))
    train_samples = input_samples[0:int(np.floor(n_data * tr_split))]
    val_samples = input_samples[int(np.floor(n_data * tr_split)):int(np.floor(n_data * val_split))]
    train_inputs = inputs.iloc[train_samples, :].reset_index(drop=True)
    val_inputs = inputs.iloc[val_samples, :].reset_index(drop=True)
    printtoken = 10
    for epoch in range(0, epochs):
        Printer = False
        if epoch % printtoken == 0:
            print('epoch=' + str(epoch))
            if epoch == (printtoken * 10):
                printtoken = printtoken * 10
        # halve the learning rate every lr_freq_adj epochs
        lr = learning_rate * (0.5 ** (epoch // lr_freq_adj))
        outputs, train_loss, all_train_loss = trainIters(DeepCorr, train_inputs, batch_size, train=True,
                                                         learning_rate=lr, weight_decay=weight_decay,
                                                         Printer=Printer)
        train_losses.append(train_loss)
        all_train_losses.extend(all_train_loss)
        if epoch % printtoken == 0:
            print("train epoch ended, calculate validation loss")
        outputs, val_loss, all_val_loss = trainIters(DeepCorr, val_inputs, batch_size, train=False,
                                                     learning_rate=lr, weight_decay=weight_decay,
                                                     Printer=Printer)
        if epoch % printtoken == 0:
            print("validation loss calculated, choose if best")
            print("Best loss:" + str(best_loss) + ", val loss:" + str(val_loss))
        if val_loss < best_loss:
            torch.save({'epoch': epoch, 'model_state_dict': DeepCorr.state_dict(),
                        'loss': val_loss}, dataname + 'DeepCorr.tar')
        best_loss = min(val_loss, best_loss)
        val_losses.append(val_loss)
        all_val_losses.extend(all_val_loss)
    # save per-epoch and per-iteration losses
    dftrainlosses = pd.DataFrame(train_losses)
    dfvallosses = pd.DataFrame(val_losses)
    dftrainlosses.to_csv(dataname + '_train_losses.csv', index=False)
    dfvallosses.to_csv(dataname + '_val_losses.csv', index=False)
    dftrainlosses = pd.DataFrame(all_train_losses)
    dfvallosses = pd.DataFrame(all_val_losses)
    dftrainlosses.to_csv(dataname + '_all_train_losses.csv', index=False)
    dfvallosses.to_csv(dataname + '_all_val_losses.csv', index=False)


#########################################################################################
#########################################################################################
#########################################################################################
if __name__ == '__main__':
    inputs_SSH = pd.read_csv('DeepCorr_Data/stepping_stone_pairs.csv')
    columns = inputs_SSH.columns
    inputs_Noise_52 = pd.read_csv('DeepCorr_Data/NoNoise/noise52_relay_stepstone-2019-11-29_18-47-14-sc1-1.csv_Conv.txt', header=None)
    inputs_Noise_53 = pd.read_csv('DeepCorr_Data/NoNoise/noise53_relay_stepstone-2019-11-29_18-48-26-sc1-1.csv_Conv.txt', header=None)
    inputs_Noise_54 = pd.read_csv('DeepCorr_Data/NoNoise/noise54_relay_stepstone-2019-11-29_18-50-23-sc1-1.csv_Conv.txt', header=None)
    inputs_Noise_55 = pd.read_csv('DeepCorr_Data/NoNoise/noise55_relay_stepstone-2019-11-29_18-53-03-sc1-1.csv_Conv.txt', header=None)
    inputs_Noise_56 = pd.read_csv('DeepCorr_Data/NoNoise/noise56_relay_stepstone-2019-11-29_18-57-49-sc1-1.csv_Conv.txt', header=None)
    inputs_Noise_57 = pd.read_csv('DeepCorr_Data/NoNoise/noise57_relay_stepstone-2019-11-29_18-56-49-sc1-1.csv_Conv.txt', header=None)
    inputs_Noise_52.columns = columns
    inputs_Noise_53.columns = columns
    inputs_Noise_54.columns = columns
    inputs_Noise_55.columns = columns
    inputs_Noise_56.columns = columns
    inputs_Noise_57.columns = columns
    inputs = pd.concat([inputs_Noise_52, inputs_Noise_53, inputs_Noise_54, inputs_Noise_55, inputs_Noise_56, inputs_Noise_57],
                       ignore_index=True)
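For orientation, the following is a hypothetical smoke test of the network above with placeholder hyperparameters (not the tuned values used in the project): 300 packets per flow, four 2 x 300 feature matrices per candidate pair (inter-arrival times and sizes in both directions).

# Hypothetical smoke test with placeholder hyperparameters.
net = ConvNet(l=300, k1=50, w1=30, k2=100, w2=10, poolw1=5, poolw2=5,
              Lin1=500, Lin2=100, Lin3=50, pad1=0, pad2=0, dropout=0.4).to(device)
batch = 8
# four feature matrices per pair: inter-arrival times and sizes of both flows
dummy = [torch.randn(batch, 1, 2, 300, device=device) for _ in range(4)]
score = net(*dummy)
print(score.shape)   # -> torch.Size([8, 1]) correlation logits

# Training on a prepared feature table with the column layout expected by trainIters:
# trainEpochs(net, inputs, batch_size=64, name='DeepCorrnet', epochs=250, learning_rate=0.01)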
Network traffic activity modelling
For my Master's thesis, I developed a fast and scalable Bayesian model that extends traditional Markov modulated Poisson processes to capture the non-Poisson, bursty nature of network traffic. The model automatically identifies different levels of user network activity, which can be used to detect abnormal usage patterns.
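For context, in a standard Markov modulated Poisson process events arrive with an intensity that switches according to a hidden continuous-time Markov chain; the sketch below states this baseline formulation (the burstiness extension itself is described in the thesis).

% Baseline MMPP: a hidden continuous-time Markov chain X(t) with generator matrix Q
% selects the current Poisson arrival intensity lambda_{X(t)}.
\begin{align*}
  X(t) &\in \{1,\dots,K\}, \quad X(t) \sim \operatorname{CTMC}(Q), \\
  \Pr\bigl(N(t+\Delta)-N(t)=n \mid X(s)=k \text{ for } s\in[t,t+\Delta]\bigr)
    &= \frac{(\lambda_k\Delta)^n}{n!}\,e^{-\lambda_k\Delta}.
\end{align*}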


A fast C++-based implementation for R can be found on my GitHub, and on CRAN.