-
Notifications
You must be signed in to change notification settings - Fork 12
/
toc.py
executable file
·363 lines (299 loc) · 14.5 KB
/
toc.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
#! /usr/bin/env python3
from collections.abc import Iterable
import datetime
import logging
from ws.client import API, APIError
from ws.interactive import require_login
from ws.autopage import AutoPage
from ws.parser_helpers.title import canonicalize
import ws.ArchWiki.lang as lang
from ws.interlanguage.CategoryGraph import CategoryGraph
from ws.interlanguage.Categorization import Categorization
from ws.interlanguage.Decategorization import Decategorization
logger = logging.getLogger(__name__)
class LowercaseDict(dict):
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
def __getitem__(self, key):
return super().__getitem__(key.lower())
def __setitem__(self, key, value):
super().__setitem__(key.lower(), value)
def __contains__(self, key):
return super().__contains__(key.lower())
def get(self, key, default=None):
return super().get(key.lower(), default)
def setdefault(self, key, default=None):
return super().setdefault(key.lower(), default)
def update(self, other):
for key, value in other.items():
super().__setitem__(key.lower(), value)
def pop(self, key, default=None):
return super().pop(key.lower(), default)
class BaseFormatter:
def __init__(self, parents, info, category_names, alsoin=None):
self.parents = parents
self.info = info
self.category_names = category_names
if alsoin is None:
alsoin = {}
alsoin.setdefault("en", "also in")
self.alsoin = alsoin
def format_also_in(self, parents, lang_tag):
alsoin = self.alsoin.get(lang_tag, self.alsoin["en"])
return " ({alsoin} {categories})".format(alsoin=alsoin, categories=", ".join(sorted(parents)))
def localize(self, category):
default = lang.detect_language(category.split(":", 1)[1])[0]
return self.category_names.get(category, default)
class PlainFormatter(BaseFormatter):
def __init__(self, parents, info, category_names, alsoin=None):
super().__init__(parents, info, category_names, alsoin)
self.text = ""
def format_root(self, title):
if isinstance(title, str):
self.text += "{} ({})\n".format(title, self.info[title]["pages"])
elif isinstance(title, Iterable):
# title is a tuple of titles
for t in title:
self.format_root(t)
if len(title) > 1:
self.text += "----\n"
def format_cell(self, title, parent, levels):
lang_tag = lang.tag_for_langname(lang.detect_language(title)[1])
# indent
output = " " * len(levels) * 4
# level
output += ".".join(str(x + 1) for x in levels)
# title, number of subpages
output += " {} ({})".format(self.localize(title), self.info[title]["pages"])
# "also in" suffix
parents = set(self.parents[title]) - {parent}
if parents:
parents = [self.localize(cat) for cat in parents]
output += self.format_also_in(parents, lang_tag)
return output
def format_row(self, *columns):
for col in columns:
if isinstance(col, Iterable) and not isinstance(col, str):
self.text += self.format_cell(*col) + "\n"
else:
self.text += str(col) + "\n"
if len(columns) > 1:
self.text += "----\n"
def __str__(self):
return self.text
class MediaWikiFormatter(BaseFormatter):
cell_format = "<span style=\"margin-left:{margin:.3}em;\"><small>{levels}</small> {catlink} <small>{info}</small></span>"
def __init__(self, parents, info, category_names, alsoin=None, include_opening_closing_tokens=True):
super().__init__(parents, info, category_names, alsoin)
self.include_opening_closing_tokens = include_opening_closing_tokens
self.text = ""
def catlink(self, title):
catlink = "[[:{}|{}]]".format(title, self.localize(title))
if lang.is_rtl_language(lang.detect_language(title)[1]):
catlink += "‎"
return catlink
def format_root(self, title):
if isinstance(title, str):
self.text += "| {} <small>({})</small>\n".format(self.catlink(title), self.info[title]["pages"])
self.text += "|-\n"
elif isinstance(title, Iterable):
# title is a tuple of titles
for t in title:
self.text += "| {} <small>({})</small>\n".format(self.catlink(t), self.info[t]["pages"])
self.text += "|-\n"
def format_cell(self, title, parent, levels):
lang_tag = lang.tag_for_langname(lang.detect_language(title)[1])
margin = 1.6 * len(levels)
lev = ".".join(str(x + 1) for x in levels) + "."
info = "({})".format(self.info[title]["pages"])
# "also in" suffix
parents = set(self.parents[title]) - {parent}
if parents:
parents = [self.catlink(cat) for cat in parents]
info += self.format_also_in(parents, lang_tag)
return self.cell_format.format(margin=margin, levels=lev, catlink=self.catlink(title), info=info)
def format_row(self, *columns):
for col in columns:
if isinstance(col, Iterable) and not isinstance(col, str):
self.text += "| " + self.format_cell(*col) + "\n"
elif col:
self.text += "| " + str(col) + "\n"
else:
self.text += "|\n"
self.text += "|-\n"
def __str__(self):
if self.include_opening_closing_tokens is True:
out = "{|\n" + self.text
if out.endswith("\n"):
out += "|}"
else:
out += "\n|}"
return out
return self.text
class TableOfContents:
def __init__(self, api, cliargs):
self.api = api
self.cliargs = cliargs
if self.cliargs.save is False and self.cliargs.print is False:
self.cliargs.print = True
if self.cliargs.toc_languages == ["all"]:
self.cliargs.toc_languages = lang.get_internal_tags()
# strip "(Language)" suffix
self.cliargs.toc_page = lang.detect_language(canonicalize(self.cliargs.toc_page))[0]
# detect page titles
self.titles = []
for ln in sorted(self.cliargs.toc_languages):
if ln == lang.tag_for_langname(lang.get_local_language()):
self.titles.append(self.cliargs.toc_page)
else:
self.titles.append("{} ({})".format(self.cliargs.toc_page, lang.langname_for_tag(ln)))
@staticmethod
def set_argparser(argparser):
import ws.config
# first try to set options for objects we depend on
present_groups = [group.title for group in argparser._action_groups]
if "Connection parameters" not in present_groups:
API.set_argparser(argparser)
output = argparser.add_argument_group(title="output mode")
_g = output.add_mutually_exclusive_group()
# TODO: maybe leave only the short option to forbid configurability in config file
_g.add_argument("-s", "--save", action="store_true",
help="try to save the page (requires being logged in)")
_g.add_argument("-p", "--print", action="store_true",
help="print the updated text in the standard output (this is the default output method)")
group = argparser.add_argument_group(title="script parameters")
group.add_argument("-a", "--anonymous", action="store_true",
help="do not require logging in: queries may be limited to a lower rate")
# TODO: maybe leave only the short option to forbid configurability in config file
group.add_argument("-f", "--force", action="store_true",
help="try to update the page even if it was last saved in the same UTC day")
group.add_argument("--toc-languages", metavar="LANG", default=["all"], nargs="+", choices=["all"] + lang.get_internal_tags(),
help="a list of language tags whose ToC pages should be updated (options: {}, default: %(default)s)".format(lang.get_internal_tags()))
group.add_argument("--toc-page", default="Table of contents",
help="the page name on the wiki to fetch and update (the language suffix "
"is added automatically as necessary) (default: %(default)s)")
# TODO: no idea how to forbid setting this globally in the config...
group.add_argument("--summary", default="automatic update",
help="the edit summary to use when saving the page (default: %(default)s)")
@classmethod
def from_argparser(klass, args, api=None):
if api is None:
api = API.from_argparser(args)
return klass(api, args)
def parse_toc_table(self, title, toc_table):
# default format is one column in the title's language
columns = [lang.tag_for_langname(lang.detect_language(title)[1])]
category_names = LowercaseDict()
alsoin = {}
if toc_table is not None:
# parse data-toc-languages attribute
try:
_languages = str(toc_table.get("data-toc-languages").value)
columns = _languages.split(",")
except ValueError:
toc_table.add("data-toc-languages", value=",".join(columns))
# parse data-toc-alsoin attribute
if toc_table.has("data-toc-alsoin"):
alsoin = self.parse_alsoin(title, str(toc_table.get("data-toc-alsoin").value))
elif columns != ["en"]:
logger.warning("Page [[{}]]: missing 'also in' translations".format(title))
# extract localized category names (useful even for PlainFormatter)
category_names = self.extract_translations(toc_table.contents)
return columns, category_names, alsoin
def parse_alsoin(self, title, value):
alsoin = {}
for item in value.split(","):
item = item.strip()
try:
tag, translation = item.split(":", maxsplit=1)
tag = tag.strip()
translation = translation.strip()
if not lang.is_language_tag(tag):
raise ValueError
except ValueError:
tag = lang.tag_for_langname(lang.detect_language(title)[1])
translation = item
alsoin[tag] = translation
logger.debug("Page [[{}]]: parsed data-toc-alsoin = {}".format(title, alsoin))
return alsoin
def extract_translations(self, wikicode):
dictionary = LowercaseDict()
for wikilink in wikicode.ifilter_wikilinks(recursive=True):
# skip catlinks without leading colon
if not wikilink.title.startswith(":"):
continue
title = self.api.Title(wikilink.title)
if title.namespace == "Category" and wikilink.text:
# skip trivial cases to apply our defaults
pure, _ = lang.detect_language(title.pagename)
if wikilink.text.lower() != title.pagename.lower() and wikilink.text.lower() != pure.lower():
dictionary[str(title)] = str(wikilink.text).strip()
return dictionary
def run(self):
if not self.cliargs.anonymous:
require_login(self.api)
# if we are going to save, make sure that the categories are correct first
if self.cliargs.save is True:
cat = Categorization(self.api)
cat.fix_allpages()
decat = Decategorization(self.api)
decat.fix_allpages()
# build category graph
graph = CategoryGraph(self.api)
# if we are going to save, init wanted categories
if self.cliargs.save is True:
graph.init_wanted_categories()
# detect target pages, fetch content at once
page = AutoPage(self.api, fetch_titles=self.titles)
for title in self.titles:
try:
page.set_title(title)
except ValueError:
# page not fetched
continue
toc_table = page.get_tag_by_id(tag="table", id="wiki-scripts-toc-table")
columns, category_names, alsoin = self.parse_toc_table(title, toc_table)
if toc_table is None:
if self.cliargs.save is True:
logger.error(
"The wiki page [[{}]] does not contain the ToC table. "
"Create the following entry point manually:\n"
"{{| id=\"wiki-scripts-toc-table\"\n...\n|}}".format(title))
continue
else:
logger.warning(
"The wiki page [[{}]] does not contain the ToC table, "
"so there will be no translations.".format(title))
if self.cliargs.print:
ff = PlainFormatter(graph.parents, graph.info, category_names, alsoin)
elif self.cliargs.save:
ff = MediaWikiFormatter(graph.parents, graph.info, category_names, alsoin, include_opening_closing_tokens=False)
else:
raise NotImplementedError("unknown output action: {}".format(self.cliargs.save))
roots = ["Category:{}".format(lang.langname_for_tag(c)) for c in columns]
ff.format_root(roots)
if len(roots) == 1:
for item in graph.walk(graph.subcats, roots[0]):
ff.format_row(item)
elif len(roots) == 2:
for result in graph.compare_components(graph.subcats, *roots):
ff.format_row(*result)
else:
logger.error("Cannot compare more than 2 languages at once. Requested: {}".format(columns))
continue
if self.cliargs.print:
print("== {} ==\n".format(title))
print(ff)
elif self.cliargs.save:
toc_table.contents = str(ff)
if self.cliargs.force or page.is_old_enough(min_interval=datetime.timedelta(days=1), strip_time=True):
try:
page.save(self.cliargs.summary)
except APIError:
pass
else:
logger.info("The page [[{}]] has already been updated this UTC day.".format(title))
if __name__ == "__main__":
import ws.config
toc = ws.config.object_from_argparser(TableOfContents, description="Build a presentation of the wiki's table of contents")
toc.run()