-
Notifications
You must be signed in to change notification settings - Fork 0
/
main.py
151 lines (120 loc) · 4.13 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
import pickle as pkl
import pandas as pd
import pefile as pe
import sys
import numpy as np
import pandas as pd
# Reference dataset, loaded once at import time from a pipe-delimited CSV.
# check_md5() looks hashes up in its "md5" column.
malwareData = pd.read_csv(
    "data/MalwareData.csv",
    sep="|",
)
def check_md5(file: str) -> str | None:
    """Look up the MD5 digest of a file in the malware dataset.

    Args:
        file (str): the path of the file

    Returns:
        str | None: the hexadecimal MD5 digest of the file contents when it
        appears in the ``md5`` column of ``malwareData``, otherwise ``None``.
    """
    # Imported locally because hashlib is not among the module-level imports.
    import hashlib

    # The earlier version returned pefile's get_imphash(), which is a hash of
    # the import table only and is not the MD5 of the file itself.
    digest = hashlib.md5()
    with open(file, "rb") as handle:
        # Read in 1 MiB chunks so large binaries are not held in memory twice.
        for chunk in iter(lambda: handle.read(1 << 20), b""):
            digest.update(chunk)
    md5_hex = digest.hexdigest()
    print(f"Hash: {md5_hex}")

    # NOTE(review): presumably the "md5" column stores lowercase hex digests
    # of whole files - confirm against the CSV.
    if md5_hex in malwareData["md5"].values:
        return md5_hex
    return None
def get_entropy(file) -> float:
    """Compute the Shannon entropy, in bits per byte, of a file or buffer.

    Args:
        file: the path of the file to read, or a bytes-like object (``bytes``,
            ``bytearray`` or ``memoryview``) that already holds the data.
            Bytes-like input is treated as data, not as a path.

    Returns:
        float: the entropy in the range 0.0 to 8.0; 0.0 for empty input.
    """
    if isinstance(file, (bytes, bytearray, memoryview)):
        # Callers that already extracted raw bytes (e.g. PE resource data)
        # can pass them directly instead of a path.
        data = bytes(file)
    else:
        with open(file, "rb") as handle:
            data = handle.read()

    # Nothing to measure; also avoids dividing by a zero length below.
    if not data:
        return 0.0

    # Histogram of byte values, then keep only the values that occur so that
    # log2 is never evaluated at zero.
    counts = np.bincount(np.frombuffer(data, dtype=np.uint8))
    probabilities = counts[counts > 0] / len(data)

    # "0.0 - x" rather than "-x" so single-valued data yields 0.0, not -0.0.
    return 0.0 - float(np.sum(probabilities * np.log2(probabilities)))
def _resource_bytes_entropy(data: bytes) -> float:
    """Return the Shannon entropy (bits per byte) of data; 0.0 when empty."""
    if not data:
        return 0.0
    counts = np.bincount(np.frombuffer(data, dtype=np.uint8))
    probabilities = counts[counts > 0] / len(data)
    return 0.0 - float(np.sum(probabilities * np.log2(probabilities)))


def get_resource_entropy(file_path):
    """ Get the entropy of the resources in the file

    Walks the three levels of the resource directory and computes one entropy
    value per leaf entry, in directory order.

    Args:
        file_path (str): the path of the file

    Returns:
        list: the entropies of the resources; ``[0]`` when the parsed file has
        no ``DIRECTORY_ENTRY_RESOURCE`` attribute.
    """
    # The earlier version tested and queried the pefile *module* (``pe``)
    # instead of this parsed instance, so the resources were never walked.
    parsed = pe.PE(file_path)
    try:
        # NOTE(review): the source's indentation was lost, so it is unclear
        # which ``if`` the original ``else: append(0)`` belonged to; it is
        # attached to this outermost check here - confirm.
        if not hasattr(parsed, 'DIRECTORY_ENTRY_RESOURCE'):
            return [0]

        resource_entropies = []
        for resource_type in parsed.DIRECTORY_ENTRY_RESOURCE.entries:
            if not hasattr(resource_type, 'directory'):
                continue
            for resource_id in resource_type.directory.entries:
                if not hasattr(resource_id, 'directory'):
                    continue
                for resource_lang in resource_id.directory.entries:
                    data_rva = resource_lang.data.struct.OffsetToData
                    size = resource_lang.data.struct.Size
                    data = parsed.get_data(data_rva, size)
                    # Entropy is computed on the bytes directly: get_data
                    # returns data, not a path that could be opened.
                    resource_entropies.append(_resource_bytes_entropy(data))
        return resource_entropies
    finally:
        # Release the handle pefile holds on the file.
        parsed.close()
def mean(data: list) -> float:
    """ Calculate the mean of the data

    Args:
        data (list): the data to calculate the mean

    Returns:
        float: the mean of the data, or 0.0 when the data is empty
    """
    # An empty list used to raise ZeroDivisionError; return 0.0 instead,
    # in line with the ``default=0`` used for max/min elsewhere in this file.
    if not data:
        return 0.0
    return sum(data) / len(data)
def extract_features_from_file(file: str) -> pd.DataFrame:
    """Build the single-row feature table for a PE file.

    Args:
        file (str): the path of the file

    Returns:
        pd.DataFrame: one row whose columns are the header fields and entropy
        statistics below, in the order they are listed in the dict.
    """
    # Parse the resources once; the earlier version re-parsed the file four
    # times and took the first/last list element as min/max even though the
    # list is in directory order, not sorted.
    resource_entropies = get_resource_entropy(file)

    file_data = pe.PE(file)
    try:
        try:
            version_info_size = len(file_data.VS_FIXEDFILEINFO)
        except (AttributeError, TypeError):
            # Attribute missing, or present but without a length.
            version_info_size = 0

        section_entropies = [section.get_entropy() for section in file_data.sections]

        # NOTE(review): the key order presumably has to match the column
        # order the model was trained on - keep it unchanged unless verified.
        features = {
            "DllCharacteristics": file_data.OPTIONAL_HEADER.DllCharacteristics,
            "Machine": file_data.FILE_HEADER.Machine,
            "Characteristics": file_data.FILE_HEADER.Characteristics,
            "Subsystem": file_data.OPTIONAL_HEADER.Subsystem,
            "VersionInformationSize": version_info_size,
            "SectionsMaxEntropy": max(section_entropies, default=0),
            "MajorSubsystemVersion": file_data.OPTIONAL_HEADER.MajorSubsystemVersion,
            "SizeOfOptionalHeader": file_data.FILE_HEADER.SizeOfOptionalHeader,
            "ResourcesMaxEntropy": max(resource_entropies, default=0),
            "ImageBase": file_data.OPTIONAL_HEADER.ImageBase,
            "ResourcesMinEntropy": min(resource_entropies, default=0),
            "MajorOperatingSystemVersion": file_data.OPTIONAL_HEADER.MajorOperatingSystemVersion,
            "SizeOfStackReserve": file_data.OPTIONAL_HEADER.SizeOfStackReserve,
            "SectionsMinEntropy": min(section_entropies, default=0),
            # Guarded here so a file without sections cannot divide by zero.
            "SectionsMeanEntropy": mean(section_entropies) if section_entropies else 0,
        }
    finally:
        # Release the handle pefile holds on the file.
        file_data.close()

    print(f"Features: {features}\nFeatures Length: {len(features)}")
    return pd.DataFrame([features])
def predict(file: str):
    """Run the pickled model on the features extracted from a file.

    Args:
        file (str): the path of the file

    Returns:
        Whatever the loaded model's ``predict`` returns for the single-row
        feature table; ``main`` reads element ``[0]`` of it.
    """
    features = extract_features_from_file(file)

    # SECURITY: unpickling executes arbitrary code, so data/trained_model.pkl
    # must come from a trusted source.
    # The context manager closes the handle that was previously left open.
    with open("data/trained_model.pkl", "rb") as model_file:
        model = pkl.load(model_file)

    return model.predict(features)
def main():
    """Command-line entry point: classify the file named on the command line.

    Expects exactly one argument (the file path); otherwise prints the usage
    line and exits with status 1. Prints the verdict, then the raw prediction
    and the file's entropy.
    """
    if len(sys.argv) != 2:
        print("Usage: python main.py <file>")
        sys.exit(1)

    file = sys.argv[1]
    result = predict(file)
    md5 = check_md5(file)

    # The chain used to open with ``elif`` and no ``if``, a syntax error.
    # A prediction of 0 is treated as "legitimate" unless the hash is also
    # found in the dataset.
    if result[0] == 0 and md5 is None:
        print(f"{file} is legitimate")
        print("MD5 not found in the dataset")
    elif result[0] == 0 and md5 is not None:
        print(f"{file} is malicious")
        print(f"MD5 found in the dataset: {md5}")
    else:
        print(f"{file} is malicious")

    # NOTE(review): the source's indentation was lost; this summary line is
    # placed after the chain so it prints for every verdict - confirm it was
    # not meant to sit inside the final ``else``.
    print(f"Predicted: {result[0]}%\nEntropy: {get_entropy(file)}")
# Run the CLI only when executed as a script, so importing this module does
# not trigger a prediction or a sys.exit().
if __name__ == "__main__":
    main()