-
Notifications
You must be signed in to change notification settings - Fork 0
/
names_to_csv.py
49 lines (39 loc) · 1.69 KB
/
names_to_csv.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
import csv
import sys
from tqdm import tqdm
# Header row of the generated CSV.
CSV_HEADER = ['tax_id', 'scientific_name', 'common_name', 'genbank_common_name', 'synonym']

# name_class values copied into the CSV, in column order after tax_id.
# Any other name class found in the input is parsed but not exported.
EXPORTED_NAME_CLASSES = ('scientific name', 'common name', 'genbank common name', 'synonym')


def parse_names_line(line):
    """Split one names.dmp line into ``(tax_id, name_class, name_txt)``.

    Fields are separated by ``"\\t|\\t"`` and the last field carries a
    trailing ``"\\t|"`` terminator, which is removed here. The third field
    (the unique name) is not used by this script and is ignored.

    Raises:
        ValueError: if the first field is not an integer.
        IndexError: if the line has fewer than four fields.
    """
    columns = line.strip().split("\t|\t")
    tax_id = int(columns[0])
    name_txt = columns[1]
    name_class = columns[3].replace('\t|', '')
    return tax_id, name_class, name_txt


def _build_row(tax_id, names):
    """Return the CSV row for one taxon; missing name classes become ''."""
    return [tax_id] + [names.get(name_class, '') for name_class in EXPORTED_NAME_CLASSES]


def iter_taxon_rows(lines):
    """Yield one CSV row per run of consecutive lines sharing a tax_id.

    Args:
        lines: iterable of raw names.dmp lines.

    Yields:
        ``[tax_id, scientific_name, common_name, genbank_common_name, synonym]``
        lists. When a name class occurs more than once within a run (for
        example several synonyms), the last occurrence wins.

    Blank lines are skipped so that a stray empty line (such as one at the
    end of the file) does not abort the conversion.
    """
    current_tax_id = None
    names = {}
    for line in lines:
        if not line.strip():
            continue
        tax_id, name_class, name_txt = parse_names_line(line)
        if tax_id != current_tax_id:
            # A new run starts: emit the finished one before resetting.
            if current_tax_id is not None:
                yield _build_row(current_tax_id, names)
            current_tax_id = tax_id
            names = {}
        names[name_class] = name_txt
    # The final run has no following line to trigger its emission.
    if current_tax_id is not None:
        yield _build_row(current_tax_id, names)


def main(names_path="names.dmp", csv_path='names.csv'):
    """Convert *names_path* (names.dmp format) into the CSV at *csv_path*.

    The input is opened first so that a missing input file does not
    truncate an existing output file, and both files are managed by
    ``with`` so they are closed even if a line fails to parse.
    """
    with open(names_path, "r") as dmp_file:
        with open(csv_path, 'w', newline='') as csv_file:
            csv_writer = csv.writer(csv_file)
            csv_writer.writerow(CSV_HEADER)
            # tqdm only wraps the line iterator to display progress.
            csv_writer.writerows(iter_taxon_rows(tqdm(dmp_file)))


if __name__ == "__main__":
    main()