-
Notifications
You must be signed in to change notification settings - Fork 2
/
extract-cvefixes-data.py
executable file
·106 lines (91 loc) · 3.97 KB
/
extract-cvefixes-data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
#!/bin/env python3
#
# Collect data from CVEfixes SQLite database, which can be built from
# code @ https://github.com/secureIT-project/CVEfixes and
# data @ https://zenodo.org/record/4476564
#
#------------------------------------------------------------------------
import pandas as pd
import sqlite3 as lite
from sqlite3 import Error
from pathlib import Path
from datetime import date
from collections import defaultdict
import re
import pprint
# Pretty-printer with a 4-space indent, presumably for ad-hoc debugging;
# no active code in view calls it.
pp = pprint.PrettyPrinter(indent=4)
#------------------------------------------------------------------------
# User settable parameters
#------------------------------------------------------------------------
# Relative path: it is resolved against the current working directory at run time.
DATA_PATH = Path("../repos/CVEfixes/Data") # Directory containing CVEfixes.db
#------------------------------------------------------------------------
# Database connection code for CVEfixes SQLite database
#------------------------------------------------------------------------
def create_connection(db_file):
    """Open the SQLite database at ``db_file`` with a 10 second lock timeout.

    Returns the sqlite3 connection on success. If sqlite3 raises an Error
    while connecting, the error is printed and None is returned instead of
    propagating the exception.
    """
    try:
        return lite.connect(db_file, timeout=10)  # connection via sqlite3
    except Error as exc:
        print(exc)
        return None
# Module-level connection shared by the queries below; it is None when
# create_connection() could not open the database.
conn = create_connection(DATA_PATH / "CVEfixes.db")
#------------------------------------------------------------------------
# Filter out files that don't contain source code
#------------------------------------------------------------------------
def filter_files(in_files):
    """Return the entries of ``in_files`` that look like source-code paths.

    A path is dropped when its final component (the text after the last
    path separator) is named readme, changelog, install, makefile or
    makefile.pl, or when it ends in .html, .md or .txt. All comparisons are
    case-insensitive. Empty or None entries are dropped as well.

    The check is made on the final path component rather than on the whole
    path, so that e.g. "docs/README" and "src/Makefile" are filtered just
    like their top-level counterparts.

    Parameters:
        in_files: iterable of path strings (entries may be None or empty).

    Returns:
        A new list with the surviving paths, unchanged and in input order.
    """
    non_source_names = {'readme', 'changelog', 'install', 'makefile', 'makefile.pl'}
    non_source_suffixes = ('.html', '.md', '.txt')

    filtered_files = []
    for path in in_files:
        if not path:
            # No usable path (None or ""): nothing to classify.
            continue
        # Accept both separators so paths recorded on Windows are handled too.
        basename = path.replace('\\', '/').rsplit('/', 1)[-1].lower()
        if basename in non_source_names or basename.endswith(non_source_suffixes):
            continue
        filtered_files.append(path)
    return filtered_files
#------------------------------------------------------------------------
# Check whether a file change has any associated method changes
#------------------------------------------------------------------------
def file_has_method(file_change_id, connection=None):
    """Return True if method_change has at least one row for this file change.

    Parameters:
        file_change_id: value matched against method_change.file_change_id.
            It is bound as text, which matches the quoted literal that the
            previous string-formatted query produced.
        connection: optional sqlite3 connection to query; when omitted, the
            module-level ``conn`` is used.

    Returns:
        True when a matching row exists, False otherwise.
    """
    if connection is None:
        connection = conn
    # read_sql_query always returns a DataFrame (never None), so the presence
    # of a match has to be decided by whether the result is empty.
    matches = pd.read_sql_query(
        "SELECT 1 FROM method_change WHERE file_change_id = ? LIMIT 1",
        connection,
        params=(str(file_change_id),),
    )
    return not matches.empty
#------------------------------------------------------------------------
# Build cves dictionary from CVEfixes database
#------------------------------------------------------------------------
# cves maps a CVE id to a positional record:
#   [hash, repo_url, committer_date, file_string, published_date]
# repo_url and committer_date are only present when the fix commit is found
# in the commits table. The output stage indexes these positions directly.
cves = defaultdict(list)
fixes_df = pd.read_sql_query("SELECT cve_id, hash FROM fixes", conn)
for row in fixes_df.itertuples():
    # NOTE: each fixes row replaces the record, so a CVE with several fix
    # commits keeps only the last one read.
    record = [row.hash]

    # Bind the hash as a parameter instead of formatting it into the SQL text.
    commit_info = pd.read_sql_query(
        "SELECT committer_date, repo_url FROM commits WHERE hash = ?",
        conn,
        params=(row.hash,),
    )
    # Use at most one commit row: appending a url/date pair per matching row
    # would shift file_string and published_date out of their positions.
    for commitrow in commit_info.head(1).itertuples():
        record.append(commitrow.repo_url)
        record.append(commitrow.committer_date)

    files_df = pd.read_sql_query(
        "SELECT file_change_id,new_path FROM file_change WHERE hash = ?",
        conn,
        params=(row.hash,),
    )
    out_files = filter_files(files_df["new_path"].tolist())
    # Joining an empty list yields "", the marker for "no source files".
    record.append(",".join(out_files))

    cves[row.cve_id] = record

cve_df = pd.read_sql_query("SELECT cve_id, published_date FROM cve", conn)
for row in cve_df.itertuples():
    # Only extend CVEs that have a fix record; indexing the defaultdict with
    # an unknown id would create an entry holding nothing but the date.
    if row.cve_id in cves:
        cves[row.cve_id].append(row.published_date)
#------------------------------------------------------------------------
# Print one ';'-separated line for each CVE whose fix touches exactly one source file
#------------------------------------------------------------------------
# Emit one line per CVE in the form:
#   cve;hash;repo_url;file;committer_date;published_date
for cve, data in cves.items():
    # A complete record has five fields:
    #   [hash, repo_url, committer_date, file_string, published_date]
    # data[4] is read below, so shorter records are skipped rather than
    # allowed to raise IndexError.
    if len(data) < 5:
        continue
    files = data[3].split(",")
    # Print only CVEs with one file
    if len(files) == 1 and files[0]:
        print("{};{};{};{};{};{}".format(cve, data[0], data[1], files[0], data[2], data[4]))
    # FIXME: print all CVEs regardless of how many files
    # for file in files:
    #     print("{};{};{};{};{}".format(cve, data[0], file, data[1], data[3]))