sorter.py
"""
Save one CSV with all results:
    - drop the description field
    - break the URL path down up to three levels deep

Fields:
    Page Title
    Page Link
    Level 1 (e.g. research)
    Level 2 (e.g. centers and institutes)
    Level 3 (e.g. some center name)

The restructured rows are saved to a new CSV.
"""
import csv
from urllib.parse import urlparse
# application
if __name__ == "__main__":
    # input and output files
    old_file = './coa.csv'
    new_file = './coa_sorted.csv'
    fieldnames = ['Page Title', 'Page Link', 'Level 1', 'Level 2', 'Level 3']

    # open the old file for reading and the new file for writing once,
    # rather than reopening the output file for every row
    with open(old_file, newline='') as csv_file, \
            open(new_file, 'w', newline='') as output_file:
        reader = csv.DictReader(csv_file)
        writer = csv.DictWriter(output_file, fieldnames=fieldnames)
        # write headers to the new file
        writer.writeheader()

        # loop through records
        for row in reader:
            title = row['Page Title']
            link = row['Page Link']
            # strip the path out of the URL
            path = urlparse(link).path
            # split the path: /foo/bar => ['', 'foo', 'bar']
            segments = path.split('/')
            # try to assign segments, falling back to 'null' when the
            # path is not deep enough
            try:
                level_1 = segments[1]
            except IndexError:
                level_1 = 'null'
            try:
                level_2 = segments[2]
            except IndexError:
                level_2 = 'null'
            try:
                level_3 = segments[3]
            except IndexError:
                level_3 = 'null'
            # write the new row to the new file
            writer.writerow({'Page Title': title,
                             'Page Link': link,
                             'Level 1': level_1,
                             'Level 2': level_2,
                             'Level 3': level_3,
                             })