-
Notifications
You must be signed in to change notification settings - Fork 1
/
extract3.rb
executable file
·54 lines (46 loc) · 1.15 KB
/
extract3.rb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
#!/usr/bin/env ruby
require 'json'
require 'htmlentities'
# Tally XKCD comic citations per author and print the top 10 authors to stdout
# Build a lookup from each comic's image URL to its comic id,
# reading one JSON record per line of xkcd.jsonl.
imgs = File.foreach("xkcd.jsonl").each_with_object({}) do |row, by_url|
  comic = JSON.parse(row)
  by_url[comic["img"]] = comic["id"]
end
# Running citation total keyed by each record's "by" field.
# Missing keys read as 0, so every author seen is registered even with no citations.
stats = Hash.new(0)
# Built once up front and reused for every line rather than allocated per record.
decoder = HTMLEntities.new
# Load raw.jsonl
File.foreach("raw.jsonl") do |line|
  data = JSON.parse(line)
  text = decoder.decode(data["text"])
  # Comic ids cited by this record; hash keys de-duplicate repeated mentions.
  cited = {}
  # Grab xkcd.com/<id>, www.xkcd.com/<id>
  # and m.xkcd.com/<id>.
  # NOTE(review): the pattern is not anchored on the left, so any host name
  # ending in "xkcd.com" followed by /<digits> also matches - confirm intended.
  text.scan(/(?:www\.|m\.)?xkcd\.com\/([0-9]+)/i).flatten.each do |id|
    cited[id.to_i] = true
  end
  # Grab various forms of "xkcd #1234"
  text.scan(/xkcd [^a-z0-9]*([0-9]+)/i).flatten.each do |id|
    cited[id.to_i] = true
  end
  # Grab imgs.xkcd.com/<img> and convert to comic id
  # Image URLs missing from the imgs lookup are silently ignored.
  text.scan(/(imgs\.xkcd\.com\/.*?(?:png|gif|jpg))/i).flatten.each do |img|
    comic_id = imgs["https://" + img]
    cited[comic_id] = true if comic_id
  end
  stats[data["by"]] += cited.length
end
# Rank authors by citation total and print the ten highest, largest first
ascending = stats.sort_by { |_author, total| total }
top_ten = ascending.last(10).reverse
puts top_ten.to_h