-
Notifications
You must be signed in to change notification settings - Fork 0
/
canv.py
227 lines (185 loc) · 9.03 KB
/
canv.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
import xmltodict
import networkx as nx
import argparse
from webweb import Web
import webweb
def load_bib(path):
"""
Load bibliography from .bib file.
You can create the .bib file yourself or download it from Google Scholar.
Download from Google Scholar:
1. open your google scholar profile
2. click the box in the left of `TITLE` (below your avatar)
3. click 'EXPORT' -> 'BibTeX'
The author list must follow the Google Bib-style, i.e., starts with "author={" and
each author name is formatted as "Last_name, First_name".
"""
with open(path) as f:
data = f.read()
co_author_lst = list()
def traverse(ipt):
author_lst = list()
try:
s_idx = ipt.index('author={')
except ValueError:
return
e_idx = ipt.index('},', s_idx)
authors = ipt[s_idx+8:e_idx].split(' and ')
for author in authors:
last_name, first_name = author.split(', ')
author_lst.append(first_name + ' ' + last_name)
co_author_lst.append(author_lst)
traverse(ipt[e_idx:])
traverse(data)
return co_author_lst
def load_dblp_xml(path):
"""
Load DBLP author metadata.
First, load author publication xml file.
Download:
1. open your DBLP author page
2. the third icon right next your Name, hover and click the `XML` in the dropdown box
3. save your XML file
Then this function read your co-authorship records.
"""
with open(path) as f:
data = xmltodict.parse(f.read())
co_author_lst = list()
for paper in data['dblpperson']['r']:
author_lst = list()
# assumption: the first element of Ordered Dict 'paper' is paper type
paper_type = list(paper.keys())
if paper_type[0] in ['article', 'inproceedings']:
authors = paper[list(paper.keys())[0]]['author']
else:
continue
if len(authors) > 2:
for author in authors:
try:
name = author['#text']
except TypeError:
continue
if name[-4:].isnumeric():
name = name[:-5]
if '(-)' in name:
name = name.replace('(-)', '')
author_lst.append(name)
else:
continue
co_author_lst.append(author_lst)
# len(co-author_lst): number of paper
# each of 'co-author_lst): list of co-authors of one paper
return co_author_lst
def build_graph(co_author_lst, min_weight, frequent_co_authors=None, show_percentage_names=0):
# build graph
g = nx.Graph()
# add edges
for paper_author in co_author_lst:
for author in paper_author:
for another_author in paper_author:
if author != another_author:
try:
g[author][another_author]['weight'] += (.5 / (len(paper_author)-1))
except KeyError:
g.add_weighted_edges_from([[author, another_author, (.5 / (len(paper_author)-1))]])
else:
pass
# for an edge, if its weight (co-author times between two authors) less than 'min_weight', filter it out
edge_list = [[u, v, w['weight']] for (u, v, w) in g.edges(data=True) if w['weight'] >= min_weight]
nodes = set([u for (u, v, w) in edge_list]).union(set([v for (u, v, w) in edge_list]))
degrees = [d for (u, d) in g.degree(nbunch=list(nodes), weight='weight')]
#
percentage_names = None
if frequent_co_authors != None and show_percentage_names == 0:
percentage_names = frequent_co_authors
elif frequent_co_authors == None and show_percentage_names != 0:
min_percentage_weight = sorted(degrees, reverse=True)[:int(len(degrees) * (show_percentage_names / 100))][-1]
percentage_names = [u for (u, d) in g.degree(nbunch=list(nodes), weight='weight') if d >= min_percentage_weight]
print('The webpage will show following {} names in default:'.format(len(percentage_names)))
print('\t' + ', '.join(sorted(percentage_names, key=lambda name: name.split(' ')[-1])))
elif frequent_co_authors == None and show_percentage_names == 0:
percentage_names = None
else:
print("Whether 'frequent_co_authors' should be 'None' or 'show_percentage_names' should be zero.")
exit()
return edge_list, percentage_names
def read_and_write_html(path):
with open(path, 'r') as f:
html_file = f.read()
start_idx = html_file.index('<script type="text/javascript">var wwdata')
end_idx = html_file.index('</script>', start_idx) + 9
html_data = html_file[start_idx:end_idx]
with open('./template.html', 'r') as f:
html_file = f.read()
start_idx = html_file.index('<script\n type="text/javascript">var wwdata')
end_idx = html_file.index('</script>', start_idx) + 9
# print(html_file[start_idx:end_idx])
html_final = html_file.replace(html_file[start_idx:end_idx], html_data)
with open(path, 'w') as f:
f.write(html_final)
def main(fre_co_authors=None,):
# load data
f_ext = args.file.split('.')[-1]
if f_ext == 'xml':
co_author_lst = load_dblp_xml(path=args.file)
elif f_ext == 'bib' or 'txt':
co_author_lst = load_bib(path=args.file)
else:
print("Wrong file name, is it has an extension of 'bib', 'xml', or 'txt'?.")
exit()
# build graph
edge_list, percentage_names = build_graph(co_author_lst,
args.min_edge_weight,
frequent_co_authors=fre_co_authors,
show_percentage_names=args.show_percentage_names)
number_co_authors = len(set([u for (u, v, w) in edge_list] + [v for (u, v, w) in edge_list]))
# build webweb's web
web = Web(title=args.name_to_match)
web.networks.xovee(
adjacency=edge_list,
)
# display options
web.display.colorBy = args.color_by
web.display.sizeBy = args.size_by
web.display.charge = args.charge
web.display.linkLength = args.link_length
web.display.scaleLinkOpacity = args.scale_link_opacity
web.display.scaleLinkWidth = args.scale_link_width
web.display.nameToMatch = args.name_to_match
web.display.radius = args.radius
web.display.showNodeNames = args.show_node_names
web.display.hideMenu = args.hide_menu
web.display.showLegend = args.show_legend
web.display.frequentCoAuthors = percentage_names
web.display.h = args.canvas_height
web.display.w = args.canvas_width
web.display.numberCoAuthor = number_co_authors
web.display.displayName = args.display_name
# web.show()
web.save(args.file.split('.')[0] + '.html')
read_and_write_html(args.file.split('.')[0] + '.html')
# change display name
if __name__ == '__main__':
parser = argparse.ArgumentParser(description='CANV Descriptions')
# parameters
parser.add_argument('file', type=str, help='Name of the data file.')
parser.add_argument('--data', default='xml', type=str, choices=['bib', 'xml'], help='Type of the data file.')
parser.add_argument('--display_name', default='Xovee Xu', type=str, help='The name displayed on the webpage.')
parser.add_argument('--min_edge_weight', default=0, type=float, help='Edge weight less than this will be removed.')
parser.add_argument('--color_by', default='strength', type=str, choices=['degree', 'strength'], help='Color nodes by.')
parser.add_argument('--size_by', default='strength', type=str, choices=['degree', 'strength'], help='Size nodes by.')
parser.add_argument('--charge', default=256, type=int, help='Charge of the graph.')
parser.add_argument('--link_length', default=200, type=int, help='Length of the links.')
parser.add_argument('--scale_link_opacity', default=1, type=int, help='Scale link opacity.')
parser.add_argument('--scale_link_width', default=1, type=int, help='Scale link width.')
parser.add_argument('--name_to_match', default='', type=str, help='Show a specific node name.')
parser.add_argument('--radius', default=15, type=int, help='Radius.')
parser.add_argument('--show_node_names', default=0, type=int, help='Whether show all node names in default.')
parser.add_argument('--hide_menu', default=1, type=int, help='Whether hide the webpage menu.')
parser.add_argument('--show_legend', default=0, type=int, help='Whether show the legend of canvas.')
parser.add_argument('--show_percentage_names', default=10, type=int, help='Percentage 0: show no names; for who have many co-authors, preferably ~5-15.')
parser.add_argument('--canvas_height', default=700, type=float, help='Height of the canvas.')
parser.add_argument('--canvas_width', default=1000, type=float, help='Width of the canvas.')
args = parser.parse_args()
frequent_co_authors = None # Co-author names to display
main(fre_co_authors=frequent_co_authors)