-
Notifications
You must be signed in to change notification settings - Fork 0
/
xiViewExporter_msannika.py
193 lines (164 loc) · 7.5 KB
/
xiViewExporter_msannika.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
#!/usr/bin/env python3
# Exporter of MS Annika Crosslink Results to xiVIEW Format
# 2022 (c) Micha Johannes Birklbauer
# https://github.com/michabirklbauer/
# micha.birklbauer@gmail.com
import argparse
import pandas as pd
from typing import List
__version = "1.0.0"
__date = "20221020"
"""
DESCRIPTION:
A script to export MS Annika results to xiVIEW input files (CSV + FASTA).
USAGE:
xiViewExporter_msannika.py f [f ...]
[-fasta FASTA]
[-ignore IGNORE]
[-o OUTPUT]
[-h]
[--version]
positional arguments:
f MS Annika crosslink result files in Microsoft Excel
format (.xlsx) to process.
required arguments:
-fasta FASTAFILE, --fasta FASTAFILE
Fasta file used for crosslink search. Must contain
proteins identified in the MS Annika result files.
optional arguments:
-ignore ACCESSION, --ignore ACCESSION
Protein accessions to be ignored. Crosslinks that only
link between ignored proteins will not be exported.
Supports input of multiple accessions.
-h, --help show this help message and exit
-o OUTPUT, --output OUTPUT
Prefix of the output files.
--version show program's version number and exit
"""
# Exporter class with constructor that takes one or multiple excel Crosslink
# result files as input, plus the fasta database and potentiol proteins to
# ignore
class MSAnnika_Exporter:
def __init__(self, input_files: List[str], fasta_file: str, ignore_list: List[str]):
self.input_files = input_files
self.ignore_list = ignore_list
# read fasta file
sequences = dict()
with open(fasta_file, "r", encoding = "utf-8") as f:
fasta_data = f.read()
f.close()
for entry in fasta_data.split(">"):
lines = entry.split("\n")
if lines[0].strip() != "":
description = lines[0].strip()
identifier = lines[0].strip().split("|")[1].strip()
sequence = "".join([x.strip() for x in lines[1:]])
if identifier not in sequences:
sequences[identifier] = {"description": description, "sequence": sequence}
else:
print("WARNING: Identifier " + identifier + " is not unique!")
self.database = sequences
def __clean_sequence(self, sequence: str) -> str:
return sequence.replace("[", "").replace("]", "").strip().upper()
def __get_xl_position(self, sequence: str) -> int:
pos = 1
for AA in sequence:
if AA == "[":
return pos
else:
pos += 1
def __get_proteins(self, accessions: str) -> str:
accessions_list = accessions.split(";")
accessions_output = ""
for accession in accessions_list:
if accession not in self.ignore_list:
accessions_output = accessions_output + accession + ";"
return accessions_output.rstrip(";")
def __get_peptide_positions(self, sequence: str, identifiers_str: str) -> str:
identifiers = identifiers_str.split(";")
peptide_positions = ""
for identifier in identifiers:
protein = self.database[identifier]["sequence"]
pos_in_protein = protein.find(self.__clean_sequence(sequence))
peptide_positions = peptide_positions + str(pos_in_protein + 1) + ";"
return peptide_positions.rstrip(";")
def __get_xl_position_in_protein(self, sequence: str, identifiers_str: str) -> str:
identifiers = identifiers_str.split(";")
protein_positions = ""
for identifier in identifiers:
protein = self.database[identifier]["sequence"]
pep_pos_in_protein = protein.find(self.__clean_sequence(sequence))
xl_pos_in_pep = self.__get_xl_position(sequence)
# we don't have to add the +1 for one-based position here
# since pep_pos_in_protein + xl_pos_in_pep = xl_pos_in_protein + 1
protein_positions = protein_positions + str(pep_pos_in_protein + xl_pos_in_pep) + ";"
return protein_positions.rstrip(";")
def __generate_csv_df(self) -> pd.DataFrame:
# columns
AbsPos1 = []
AbsPos2 = []
Protein1 = []
Protein2 = []
Decoy1 = []
Decoy2 = []
Score = []
for input_file in self.input_files:
df = pd.read_excel(input_file)
for i, row in df.iterrows():
if self.__get_proteins(row["Accession A"]) != "" and self.__get_proteins(row["Accession B"]) != "":
AbsPos1.append(self.__get_xl_position_in_protein(row["Sequence A"], self.__get_proteins(row["Accession A"])))
AbsPos2.append(self.__get_xl_position_in_protein(row["Sequence B"], self.__get_proteins(row["Accession B"])))
Protein1.append(self.__get_proteins(row["Accession A"]))
Protein2.append(self.__get_proteins(row["Accession B"]))
Score.append(row["Best CSM Score"])
result = pd.DataFrame({"AbsPos1": AbsPos1, "AbsPos2": AbsPos2, "Protein1": Protein1, "Protein2": Protein2, "Score": Score})
return result
def __generate_fasta_str(self) -> str:
fasta_str = ""
for key in self.database:
fasta_str = fasta_str + ">" + self.database[key]["description"] + "\n" + self.database[key]["sequence"] + "\n"
return fasta_str
# export function, takes one argument "output_file" which sets the prefix
# of generated output files
def export(self, output_file = None, format = "xiVIEW") -> None:
csv = self.__generate_csv_df()
fasta = self.__generate_fasta_str()
if output_file == None:
output_file = self.input_files[0].split(".")[0]
csv.to_csv(output_file + ".csv", index = False)
with open(output_file + ".fasta", "w", encoding = "utf-8") as f:
f.write(fasta)
f.close()
# initialize exporter and export xiVIEW files
def main() -> None:
parser = argparse.ArgumentParser()
parser.add_argument(metavar = "f",
dest = "files",
help = "Name/Path of the MS Annika result files to process.",
type = str,
nargs = "+")
parser.add_argument("-fasta", "--fasta",
dest = "fasta",
default = None,
help = "Name/Path of the fasta file.",
required = True,
type = str)
parser.add_argument("-ignore", "--ignore",
dest = "ignore",
default = [],
help = "Protein identifiers to ignore.",
type = str,
nargs = "*")
parser.add_argument("-o", "--output",
dest = "output",
default = None,
help = "Name of the output files.",
type = str)
parser.add_argument("--version",
action = "version",
version = __version)
args = parser.parse_args()
exporter = MSAnnika_Exporter(args.files, args.fasta, args.ignore)
exporter.export(args.output)
if __name__ == "__main__":
main()