# main.py

import hashlib
import math
import pickle as pkl
import sys
from collections import Counter

import numpy as np
import pandas as pd
import pefile as pe

# Reference dataset, loaded once when the module is imported; the file is
# pipe-separated and check_md5 looks hashes up in its "md5" column.
malwareData = pd.read_csv('data/MalwareData.csv', sep='|')

def check_md5(file: str) -> str | None:
	""" Look up the MD5 digest of the file in the malware dataset
	Args:
		file (str): the path of the file
	Returns:
		str | None: the hex MD5 digest when it appears in the "md5" column
			of malwareData, otherwise None
	Raises:
		OSError: if the file cannot be opened or read
	"""

	# Digest the raw file contents. pefile's get_imphash() only hashes the
	# import table, so it is not the file's MD5.
	# NOTE(review): assumes the dataset's "md5" column stores whole-file
	# digests -- confirm against the dataset.
	digest = hashlib.md5(usedforsecurity=False)
	with open(file, "rb") as handle:
		# Read in chunks so large samples are never held in memory at once
		while chunk := handle.read(65536):
			digest.update(chunk)

	file_hash = digest.hexdigest()
	print(f"Hash: {file_hash}")

	# Check if the hash is in the dataset
	if file_hash in malwareData["md5"].values:
		return file_hash
	return None

def get_entropy(file) -> float:
	""" Compute the Shannon entropy of the bytes of a file
	Args:
		file (str): the path of the file
	Returns:
		float: the entropy in bits per byte, between 0.0 and 8.0
			(0.0 for an empty file)
	Raises:
		OSError: if the file cannot be opened or read
	"""

	# Use a separate name for the handle so the path argument is not shadowed
	with open(file, "rb") as handle:
		data = handle.read()

	data_length = len(data)
	if data_length == 0:
		# read() returns b"" (never None) for an empty file
		return 0.0

	counts = Counter(data)
	entropy = 0.0

	# Walk the byte values in ascending order so the floating-point
	# summation order is fixed for a given input
	for byte_value in sorted(counts):
		probability = counts[byte_value] / data_length
		entropy -= probability * math.log2(probability)

	return entropy

def _bytes_entropy(data: bytes) -> float:
	""" Compute the Shannon entropy (bits per byte) of an in-memory buffer """
	total = len(data)
	if total == 0:
		return 0.0

	entropy = 0.0
	for count in Counter(data).values():
		probability = count / total
		entropy -= probability * math.log2(probability)
	return entropy


def get_resource_entropy(file_path):
	""" Get the entropy of the resources in the file
	Args:
		file_path (str): the path of the file
	Returns:
		list: one entropy value per resource data entry, sorted in
			ascending order so that index 0 is the minimum and index -1
			the maximum; [0] when the file has no resource directory
	"""

	pe_file = pe.PE(file_path)
	try:
		# The resource directory is an attribute of the parsed PE instance,
		# not of the pefile module
		if not hasattr(pe_file, 'DIRECTORY_ENTRY_RESOURCE'):
			return [0]

		resource_entropies = []
		for resource_type in pe_file.DIRECTORY_ENTRY_RESOURCE.entries:
			if not hasattr(resource_type, 'directory'):
				continue
			for resource_id in resource_type.directory.entries:
				if not hasattr(resource_id, 'directory'):
					continue
				for resource_lang in resource_id.directory.entries:
					# Skip entries that carry no data payload
					if not hasattr(resource_lang, 'data'):
						continue
					data_rva = resource_lang.data.struct.OffsetToData
					size = resource_lang.data.struct.Size
					data = pe_file.get_data(data_rva, size)
					# get_data returns raw bytes, so the entropy is computed
					# in memory rather than through a file path
					resource_entropies.append(_bytes_entropy(data))

		resource_entropies.sort()
		return resource_entropies
	finally:
		# Release the handle held by the parsed PE
		pe_file.close()

def mean(data: list) -> float:
	""" Calculate the mean of the data
	Args:
		data (list): the data to calculate the mean
	Returns:
		float: the mean of the data, or 0.0 when data is empty
	"""

	# Guard the division: an empty list would raise ZeroDivisionError
	if len(data) == 0:
		return 0.0

	return sum(data) / len(data)

def extract_features_from_file(file: str) -> pd.DataFrame:
	""" Build the feature row for a PE file
	Args:
		file (str): the path of the file
	Returns:
		pd.DataFrame: a single-row frame whose columns are the extracted
			header fields and entropy statistics, in the order listed below
	"""

	file_data = pe.PE(file)
	try:
		try:
			version_info_size = len(file_data.VS_FIXEDFILEINFO)
		except (AttributeError, TypeError):
			# No fixed version info, or it has no length: count it as absent
			version_info_size = 0

		# Compute each entropy list once instead of re-parsing per feature
		section_entropies = [section.get_entropy() for section in file_data.sections]
		resource_entropies = get_resource_entropy(file)

		features = {
			"DllCharacteristics": file_data.OPTIONAL_HEADER.DllCharacteristics,
			"Machine": file_data.FILE_HEADER.Machine,
			"Characteristics": file_data.FILE_HEADER.Characteristics,
			"Subsystem": file_data.OPTIONAL_HEADER.Subsystem,
			"VersionInformationSize": version_info_size,
			"SectionsMaxEntropy": max(section_entropies, default=0),
			"MajorSubsystemVersion": file_data.OPTIONAL_HEADER.MajorSubsystemVersion,
			"SizeOfOptionalHeader": file_data.FILE_HEADER.SizeOfOptionalHeader,
			# max()/min() rather than [-1]/[0]: the list is not assumed sorted
			"ResourcesMaxEntropy": max(resource_entropies, default=0),
			"ImageBase": file_data.OPTIONAL_HEADER.ImageBase,
			"ResourcesMinEntropy": min(resource_entropies, default=0),
			"MajorOperatingSystemVersion": file_data.OPTIONAL_HEADER.MajorOperatingSystemVersion,
			"SizeOfStackReserve": file_data.OPTIONAL_HEADER.SizeOfStackReserve,
			"SectionsMinEntropy": min(section_entropies, default=0),
			# A file without sections yields 0, matching the min/max defaults
			"SectionsMeanEntropy": mean(section_entropies) if section_entropies else 0,
			#"ResourcesMinSize": min([section.SizeOfRawData for section in file_data.sections], default=0),
		}
	finally:
		# Release the handle held by the parsed PE
		file_data.close()

	print(f"Features: {features}\nFeatures Length: {len(features)}")

	return pd.DataFrame([features])

def predict(file:str) -> any:
	""" Run the trained model on the features extracted from a file
	Args:
		file (str): the path of the file
	Returns:
		the value returned by the model's predict() for the feature row
	"""

	features = extract_features_from_file(file)

	# NOTE: unpickling executes arbitrary code, so data/trained_model.pkl
	# must come from a trusted source.
	# The with-block closes the model file once it has been loaded.
	with open("data/trained_model.pkl", "rb") as model_file:
		model = pkl.load(model_file)

	return model.predict(features)

def main():
	""" Command-line entry point
	Expects exactly one argument, the path of the file to analyse; prints
	the verdict, the dataset lookup result, the prediction and the file
	entropy. Exits with status 1 when the argument count is wrong.
	"""

	if len(sys.argv) != 2:
		print("Usage: python main.py <file>")
		sys.exit(1)

	file = sys.argv[1]
	result = predict(file)

	md5 = check_md5(file)

	# A prediction of 0 with no dataset hit is reported as legitimate;
	# every other combination is reported as malicious.
	if result[0] == 0 and md5 is None:
		print(f"{file} is legitimate")
		print("MD5 not found in the dataset")

	elif result[0] == 0 and md5 is not None:
		print(f"{file} is malicious")
		print(f"MD5 found in the dataset: {md5}")

	else:
		print(f"{file} is malicious")

	print(f"Predicted: {result[0]}%\nEntropy: {get_entropy(file)}")

# Run the CLI only when executed as a script, so that importing this module
# does not trigger a prediction or a sys.exit.
if __name__ == "__main__":
	main()