-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathscript.py
228 lines (180 loc) · 6.71 KB
/
script.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
import pandas as pd
import numpy as np
import datetime
import matplotlib.pyplot as plt
import re
from sklearn.cluster import KMeans
from sklearn import preprocessing
def intertime(time_list):
    """Return the average gap between consecutive entries of ``time_list``.

    Gaps are measured between neighbouring elements in the order given, so
    callers that want chronological gaps must pass a sorted sequence.

    Args:
        time_list: Indexable sequence of values that support subtraction.

    Returns:
        The sum of the consecutive differences divided by the number of
        differences, or 0 when ``time_list`` has fewer than two entries
        (there is no gap to measure).
    """
    gaps = [time_list[i + 1] - time_list[i]
            for i in range(len(time_list) - 1)]
    if not gaps:
        return 0
    # n timestamps produce n - 1 gaps; average over the gaps, not the
    # timestamps.
    return sum(gaps) / len(gaps)
def explore_graph(data):
    """Display three figures of overlaid histograms for exploring ``data``.

    For each column of interest the number of occurrences of every distinct
    value is counted with ``value_counts``; the histograms show how those
    occurrence counts are distributed.

    Args:
        data: DataFrame with ``source_ip``, ``destination_ip``,
            ``destination_port``, ``source_port``, ``num_packets`` and
            ``num_bytes`` columns.
    """
    # Count occurrences for every column up front, before anything is drawn.
    column_names = (
        'source_ip',
        'destination_ip',
        'destination_port',
        'source_port',
        'num_packets',
        'num_bytes',
    )
    occurrences = {}
    for column in column_names:
        occurrences[column] = data[column].value_counts()

    # One entry per figure; each layer is (column, histogram range, alpha).
    figures = (
        (('source_ip', (0, 1000), 0.5), ('destination_ip', (0, 1000), 0.5)),
        (('source_port', (0, 10), 0.5), ('destination_port', (0, 10), 0.5)),
        (('num_packets', (0, 10), 0.8), ('num_bytes', (0, 100), 0.8)),
    )
    for layers in figures:
        for column, value_range, opacity in layers:
            plt.hist(occurrences[column], bins=100, range=value_range,
                     alpha=opacity)
        plt.show()
def split_ip(data, ip_regex):
    """Split ``destination_ip`` into three parts and add them as columns.

    The first place ``ip_regex`` matches in each address is used.  Its first
    capture group is stored in ``ip_f1``, the second in ``ip_l1`` and the
    third in ``ip_l2``.

    Args:
        data: DataFrame with a ``destination_ip`` column of strings.
        ip_regex: Regular expression with at least three capture groups.

    Returns:
        A copy of ``data`` with the columns ``ip_l1``, ``ip_l2`` and
        ``ip_f1`` appended (in that order).  Addresses that do not match
        ``ip_regex`` get ``None`` in all three columns.
    """
    pattern = re.compile(ip_regex)
    ip_recv_f1, ip_recv_l1, ip_recv_l2 = [], [], []
    for ip in data['destination_ip']:
        match = pattern.search(ip)
        if match is None:
            parts = (None, None, None)
        else:
            # '' for groups that did not take part in the match, which is
            # what re.findall reports as well.
            parts = match.groups('')
        ip_recv_f1.append(parts[0])
        ip_recv_l1.append(parts[1])
        ip_recv_l2.append(parts[2])

    # Plain lists are assigned by position, so the new columns line up with
    # the rows of ``data`` whatever its index labels are.
    result = data.copy()
    result['ip_l1'] = ip_recv_l1
    result['ip_l2'] = ip_recv_l2
    result['ip_f1'] = ip_recv_f1
    print("IP addresses have been split up in sub domain and machine numbers")
    return result
def cleanup(data):
    """Return ``data`` without its address and metadata columns.

    Args:
        data: DataFrame containing ``destination_ip``, ``flags``, ``site``
            and ``asn`` columns.

    Returns:
        A new DataFrame with those four columns removed; ``data`` itself is
        left unchanged.
    """
    # A single drop call removes all four columns with one copy of the frame.
    data = data.drop(['destination_ip', 'flags', 'site', 'asn'], axis=1)
    print('Data cleaned')
    return data
def _build_feature_matrix(data):
    """Summarise traffic for every (destination group, source IP) pair.

    Rows of ``data`` are grouped by ``ip_f1`` and, inside each group, by
    ``source_ip``.  Every such pair produces one output row, so a source that
    contacted several destination groups appears once per group.

    Args:
        data: DataFrame with ``ip_f1``, ``source_ip``, ``num_packets``,
            ``num_bytes`` and ``start_time`` columns.

    Returns:
        DataFrame with the columns ``source_ip``, ``num_attempts`` (rows in
        the pair), ``avg_num_packets``, ``avg_num_bytes`` (column means over
        the pair) and ``avg_intertime`` (``intertime`` of the pair's sorted
        start times).
    """
    columns = ['source_ip', 'num_attempts', 'avg_num_packets',
               'avg_num_bytes', 'avg_intertime']
    rows = []
    for _, by_destination in data.groupby('ip_f1'):
        for source, attempts in by_destination.groupby('source_ip'):
            rows.append({
                'source_ip': source,
                'num_attempts': len(attempts),
                'avg_num_packets': attempts['num_packets'].mean(),
                'avg_num_bytes': attempts['num_bytes'].mean(),
                # intertime measures gaps between neighbours, so the start
                # times of this pair's rows are put in ascending order first.
                'avg_intertime': intertime(sorted(attempts['start_time'])),
            })
    return pd.DataFrame(rows, columns=columns)


if __name__ == '__main__':
    # Load data
    csv_path = 'ds_1_with_fields.csv'
    data = pd.read_csv(csv_path)
    print("Data loaded")

    # Uncomment to see exploratory histograms
    # explore_graph(data)

    # Split destination ip
    ip_regex = r"^([0-9]{2,3}\.[a-zA-Z0-9]{5})\.([a-zA-Z0-9]{5})\.([a-zA-Z0-9]{1,3}$)"
    data = split_ip(data, ip_regex)

    # Remove columns of flags, site, asn, destination_ip
    data = cleanup(data)
    print("Data cleaning now . .. ......")

    # Number of distinct destination groups (values of ip_f1) in the data set
    num_recs = len(data.groupby('ip_f1').groups)
    print("Data segregated for every destination IP")

    # For every destination group build one feature row per source: number
    # of attempts, average packets, average bytes and average time between
    # attempts.
    print("For every destination IP, finding source IP that attempted to dock . .. ...")
    print("For every source IP, calculating number of attempts, avg time size, and packet size")
    features = _build_feature_matrix(data)
    print('Feature matrix created!')
    print('Number of sources: %d' % len(features))
    print('Number of destinations: %d' % num_recs)

    # Keep the source addresses aside; only the numeric columns are clustered.
    source_ip = features['source_ip'].values
    features = features.drop('source_ip', axis=1)
    # NOTE(review): the integer cast truncates the fractional part of the
    # averages before scaling -- confirm this is intended.
    features = np.asarray(features, dtype='int')
    features_scaled = preprocessing.scale(features)
    print('Data scaled (Z-transformed) and converted to numpy arrays for K-Means')

    print('Creating K-Means classifier to identify two clusters')
    kmeans = KMeans(n_clusters=2, n_init=100)
    print('fitting KMeans classifier...')
    clusters = kmeans.fit(features_scaled)
    labels = clusters.labels_

    # One row per feature row: the source address and the cluster it fell in.
    results = pd.DataFrame({'source_ip': source_ip, 'labels': labels})
    print('Malicious IP addresses identified')
    print(results)