-
Notifications
You must be signed in to change notification settings - Fork 0
/
1_Uniprot_parser.py
358 lines (323 loc) · 17.1 KB
/
1_Uniprot_parser.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
#!/usr/bin/python
# manojmw
# 04 Dec, 2021
import re
import argparse
import logging
import sys
###########################################################
# Parses on STDIN a Uniprot file and extracts the following fields from each record:
# - Primary Accession and Secondary Accession(s) from the 'AC' line
# - Gene Name and synonyms from the 'GN' line
# - Taxonomy Identifier from the 'OX' line
# - ENST(s), ENSG(s) & GeneID(s) from the 'DR' line
#
# Prints to STDOUT in .tsv format
# Ouput consists of 7 columns (one record per line):
# - Uniprot Primary Accession
# - Taxonomy Identifier
# - ENST (or a comma seperated list of ENSTs)
# - ENSG (or a comma seperated list of ENSGs)
# - Uniprot Secondary Accession (or a comma seperated list of Uniprot Secondary Accessions)
# - GeneID (or a comma seperated list of GeneIDs)
# - Gene Name (or a comma seperated list of Gene Names)
def uniprot_parser(UniProtinFile):
logging.info("Starting to run...")
UniProtinFile = sys.stdin
# Header line
UniProt_header = ['Primary_AC', 'TaxID', 'ENSTs', 'ENSGs', 'Secondary_ACs', 'NCBI_GeneID', 'GeneName', 'Alt_GeneID', 'Function']
print('\t'.join(UniProt_header))
# Initializing variables/accumulators
ACs = ''
TaxID = 0
ENSTs = []
ENSGs = []
GeneIDs = []
GeneNames = []
Alt_GeneID = ''
Function = ''
# Compiling all the regular expressions
# Accession Numbers (AC), strip trailing ';'
re_AC = re.compile('^AC\s+(\S.*);$')
# Gene Name/Synonyms from the GN line
# the below regex pattern ends with either ; or , because
# UniProt has a bug (at the time of writing this script) -
# the lines for some records are incomplete and does not
# end with semi-colon as it is supposed to and even unclosed curly braces
# Ex:
# GN Name=aadK {ECO:0000303|PubMed:17609790, ECO:0000303|PubMed:8293959,
# Sometimes, GN lines can also start just with 'Synonyms'
# Ex:
# GN Synonyms=AO {ECO:0000303|PubMed:27255930}
re_GN = re.compile('^GN\s+((Name=\S.*)|(Synonyms=\S.*))[,;]$')
# Organism (TaxID) from the OX line; some lines violate the uniprot spec
# Grab only TaxID even if additional info comes after the TaxID
# eg NCBI_TaxID=32201 {ECO:0000312|EMBL:ABW86978.1};
re_TaxID = re.compile('^OX\s+NCBI_TaxID=(\d+)[; ]')
# Ensembl transcripts and Genes from the DR line
re_ENS = re.compile('^DR\s+Ensembl; (\w+).*; \S+; (\w+).*\.')
# GeneIDs from the DR line
re_GID = re.compile('^DR\s+GeneID;\s+(\d+);')
# HGNC ID from the DR line
re_HGNC_ID = re.compile('^DR\s+HGNC;\s(HGNC:\d+);')
# Function from the CC line
re_Function = re.compile('^CC\s+-!- FUNCTION:\s+(.+)')
# inFunction == True; when we are in a CC-FUNCTION block,
# False otherwise; inFunction == False
inFunction = False
# Data lines
for line in UniProtinFile:
line = line.rstrip('\r\n') # removing trailing new lines and carriage returns
if (inFunction) and ((re.match(r'^CC\s+-!-', line)) or (re.match(r'^CC\s+---', line)) or (re.match(r'^(^CC\s/)', line))):
# we were in a CC-function block but we're not in it anymore,
# update boolean flag and then still process the line
inFunction = False
# Matching and retrieving the records
if (re_AC.match(line)):
if (ACs != ''):
# Add trailing separator to the previous entries - distingushes each AC
ACs += '; '
ACs += re_AC.match(line).group(1)
elif (re.match(r'^AC\s', line)):
# If any AC line is missed, Exit the program with an error message
logging.error("Missed the AC line: \n" + line)
sys.exit()
elif (re_GN.match(line)):
GNLine = re_GN.match(line).group(1)
# As per the UniProt documentation, the GN
# line is supposed to have a defined pattern
# but that's not the case in reality
# So we grab everything from the GN line
# The GN line always ends with a ';' except in some cases where
# the records are incomplete and has a bug like below (ends with a ',')
# GN Name=aadK {ECO:0000303|PubMed:17609790, ECO:0000303|PubMed:8293959,
try:
# If the GN line contains any additional info other than
# Gene Name, it will also be seperated by a '; '
# Ex: GN Name=Jon99Cii; Synonyms=SER1, SER5, Ser99Da; ORFNames=CG7877;
GNList = GNLine.split('; ')
if GNList:
for geneinfo in GNList:
# Retrieving only the Gene Name
if re.match('^Name=(\S.*)', geneinfo):
GeneNameD = re.match('^Name=(\S.*)', geneinfo).group(1)
# Sometimes, Gene Name can contain additional info
# such as pubmed and other accession IDs
# So, we want eliminate these from our
# Gene name
# Ex: GN Name=dbaA {ECO:0000303|PubMed:23001671};
try:
# So we split at the ' {' and keep only the
# Gene Name
GeneNamewithAcID = GeneNameD.split(' {')
GeneName = GeneNamewithAcID[0]
except:
GeneName = GeneNameD
# There can be multiple 'GN' lines, especially
# when there are many synonyms
# So we check to avoid adding the same gene name again
# Ex:
# GN Name=Jon99Cii; Synonyms=SER1, SER5, Ser99Da; ORFNames=CG7877;
# GN Name=Jon99Ciii; Synonyms=SER2, SER5, Ser99Db; ORFNames=CG15519;
if not GeneName in GeneNames:
GeneNames.append(GeneName)
# retreiving synonyms for Gene Name (if it exists)
if re.match('^Synonyms=(\S.*)', geneinfo):
GeneSynoinfo = re.match('^Synonyms=(\S.*)', geneinfo).group(1)
GeneSynonyms = []
# There can be multiple synonyms seperated by a ','
# Ex:
# GN Name=Jon99Cii; Synonyms=SER1, SER5, Ser99Da;
try:
# Splitting at ',' can sometimes add additional
# info of synonym to the list as well
# Especially, when more than one additional info
# exists for a given Gene synonym and these are also
# Seperated by a ','. We eliminate this later while
# adding synonyms to the Gene Names list
#
# Gene name Synonym with one additional info
# Ex: GN Synonyms=ALI2 {ECO:0000250|UniProtKB:Q80ZD8}
#
# Gene name Synonym with more than one additional info
# seperated by a ','
# Ex: GN Synonyms=OPCL1 {ECO:0000303|PubMed:18267944, ECO:0000303|PubMed:19704801};
GeneSynonymList = GeneSynoinfo.split(', ')
if GeneSynonymList:
# Like Gene Name, even Gene synonyms can
# can contain additional info such as pubmed and other accession IDs
# So, we want eliminate these and keep only the synonym name
# Ex:
# GN Name=Sh3bp5; Synonyms=Sab {ECO:0000303|PubMed:10339589};
for synonym in GeneSynonymList:
try:
Gsynowithaddinfo = synonym.split(' {')
Gsynonym = Gsynowithaddinfo[0]
GeneSynonyms.append(Gsynonym)
except:
GeneSynonyms.append(synonym)
except:
GeneSynonyms.append(GeneSynoinfo)
# Avoid adding the same synonym again especially
# when they occur on multiple 'GN' lines
for synonym in GeneSynonyms:
# Eliminating additional info (i.e not a synonym)
if not ':' in synonym:
if not synonym in GeneNames:
GeneNames.append(synonym)
except:
# if the GN line contains only the Gene name, we do not split
# Ex: GN Name=APOM;
if re.match('^Name=(\S.*)', GNLine):
GeneNameD = re.match('^Name=(\S+)', GNLine).group(1)
try:
# When Gene Name contains additional info
GeneNamewithAcID = GeneNameD.split(' {')
GeneName = GeneNamewithAcID[0]
except:
GeneName = GeneNameD
if not GeneName in GeneNames:
GeneNames.append(GeneName)
elif (re.match(r'^GN\s+Name.*', line)):
logging.error("Error: Missed the Gene Name \n" + "Protein: " + ACs + "\n" + line)
sys.exit()
elif (re.match(r'^GN\s+Synonyms.*', line)):
logging.error("Error: Missed the Gene Name Synonym \n" + "Protein: " + ACs + "\n" + line)
sys.exit()
elif (re_TaxID.match(line)):
if (TaxID != 0):
logging.error("Several OX lines for the protein: \t" + "Protein: " + ACs + "\n" + line)
sys.exit()
TaxID = re_TaxID.match(line).group(1)
elif (re.match(r'^OX\s',line)):
logging.error("Missed the OX line \n" + "Protein: " + ACs + "\n" + line)
sys.exit()
elif (re_ENS.match(line)):
ENS_match = re_ENS.match(line)
ENST = ENS_match.group(1)
if ENST not in ENSTs:
ENSTs.append(ENST)
ENSG = ENS_match.group(2)
if ENSG not in ENSGs:
ENSGs.append(ENSG)
elif (re.match(r'^DR\s+Ensembl;', line)):
logging.error("Failed to get all the Ensembl Identifiers \n" + "Protein: " + ACs + "\n" + line)
sys.exit()
elif (re_GID.match(line)):
GeneIDs.append(re_GID.match(line).group(1))
elif (re.match(r'^DR\s+GeneID.*', line)):
logging.error("Missed the GeneIDs \n" + "Protein: " + ACs + "\n" + line)
sys.exit()
elif (re_HGNC_ID.match(line)):
Alt_GeneID = re_HGNC_ID.match(line).group(1)
elif (re.match(r'^DR\s+HGNC', line)):
logging.error("Found Additional HGNC Gene ID lines \n" + "Protein: " + ACs + "\n" + line)
sys.exit()
# The below elif code block is useful for finding orthologs at later stage
# The code written here is useful for finding an Yeast ortholog (TaxID = 559292)
# By default, it is commented out; You can modify code as per your needs
# Note: The model organism Gene ID should match with Gene ID used by
# Alliance of Genome Resource Database (AGR)
# Also you should note below, an example Identifier used by AGR for yeast is SGD:S000004208
# In Uniprot the DR line is: DR SGD; S000005540; RTS1.
# For human proteins however, DR HGNC; HGNC:12279; XXX.
# Uniprot records are broken some times and do not follow the same pattern
# So, I am adding 'SGD:', You need to cross check yours
# elif (re.match(r'^DR\s+SGD;\s(\w+);',line)):
# Alt_GeneID = 'SGD:' + re.match(r'^DR\s+SGD;\s(\w+);',line).group(1)
# elif (re.match(r'^DR\s+SGD', line)):
# logging.error("Found Additional SGD Gene ID lines \n" + "Protein: " + ACs + "\n" + line)
# sys.exit()
elif(re_Function.match(line)):
inFunction = True # We are in a function block
Function += re_Function.match(line).group(1)
elif (inFunction):
if not (re.match(r'^CC\s+(.+)', line)):
logging.error('We are in CC-function block but line does not start with CC, impossible!' + "Protein: " + ACs + "\n" + line)
sys.exit()
Function += ' ' + re.match(r'^CC\s+(.+)', line).group(1)
# '//' means End of the record
# we Process the retreived data
elif (line == '//'):
# ignore entry if bad species; TaxID Human = 9606
if (TaxID == '9606'):
# Again, the below if block is useful only for fincding orthologs
# So, commented out by default; Here TaxID = '559292' is for Yeast
# You should replace it with the TaxID of the model organism for which
# you want an ortholog
# In case you intend on using this, then uncomment the below if block
# and comment the above (if (TaxID == '9606'): block
# if (TaxID == '9606') or (TaxID == '559292'):
try:
ACs_split = ACs.split('; ')
primary_AC = ACs_split[0] # Grab only the first AC
secondary_AC_list = ACs_split[1:] # Grab the remaining ACs
# storing secondary_ACs as a single comma-seperated string
secondary_ACs = ','.join(secondary_AC_list)
except:
logging.error("Failed to store store UniProt Accession IDs for the protein: \t" + "Protein: " + ACs + "\n")
sys.exit()
try:
# storing Gene names as a single comma-seperated string
GeneNames = ','.join(GeneNames)
except:
logging.error('Error: Failed to store Gene Name for the protein: \t' + "Protein: " + ACs + "\n")
sys.exit()
try:
# storing ENSTs and ENSGs as a single comma-seperated string
ENSTs = ','.join(ENSTs)
ENSGs = ','.join(ENSGs)
except:
logging.error("Failed to store Ensembl Identifiers for the protein: \t" + "Protein: " + ACs + "\n")
sys.exit()
try:
# storing GeneIDs as a single comma-seperated string
GeneIDs = ','.join(GeneIDs)
except:
logging.error('Error: Failed to store Gene Identifiers for the protein: \t' + "Protein: " + ACs + "\n")
sys.exit()
# Printing to STDOUT
UniProt_outline = [primary_AC, TaxID, ENSTs, ENSGs, secondary_ACs, GeneIDs, GeneNames, Alt_GeneID, Function]
print('\t'.join(UniProt_outline))
# Reset all accumulators and move on to the next record
ACs = ''
TaxID = 0
ENSTs = []
ENSGs = []
GeneIDs = []
GeneNames = []
Alt_GeneID = ''
Function = ''
continue
logging.info("All Done, completed successfully!")
return
###########################################################
# Taking and handling command-line arguments
def main():
file_parser = argparse.ArgumentParser(description =
"""
--------------------------------------------------------------------------------------------------
Program: Parses on STDIN a UniProt file, processes each record and prints to STDOUT in .tsv format
--------------------------------------------------------------------------------------------------
The output consists of 9 columns (tsv):
-> Uniprot Primary Accession
-> Taxonomy Identifier
-> ENST (or a comma seperated list of ENSTs)
-> ENSG (or a comma seperated list of ENSGs)
-> Uniprot Secondary Accession (or a comma seperated list of Uniprot Secondary Accessions)
-> GeneID (or a comma seperated list of GeneIDs)
-> Gene Name (or a comma seperated list of Gene Names)
-> HGNC ID
-> Function
--------------------------------------------------------------------------------------------------
Arguments [defaults] -> Can be abbreviated to shortest unambiguous prefixes
""",
formatter_class = argparse.RawDescriptionHelpFormatter)
file_parser.set_defaults(func=uniprot_parser)
args = file_parser.parse_args()
args.func(args)
if __name__ == "__main__":
# Logging to Standard Error
Log_Format = "%(levelname)s - %(asctime)s - %(message)s \n"
logging.basicConfig(stream = sys.stderr, format = Log_Format, level = logging.DEBUG)
main()