# crawler.py: archive COVID-19 bulletin pages from the Shanghai Municipal
# Health Commission site (wsjkw.sh.gov.cn) and keep a JSON index of crawled URLs.
import os
import json
import hashlib

import requests
from bs4 import BeautifulSoup

from util import read_file, write_file
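
# The util module is not shown on this page; judging by how read_file and
# write_file are called below, they are presumably thin UTF-8 text-file helpers
# roughly along these lines (a hedged sketch, not the repository's actual code):
#
#     def read_file(filename):
#         with open(filename, encoding='utf8') as f:
#             return f.read()
#
#     def write_file(content, filename):
#         with open(filename, 'w', encoding='utf8') as f:
#             f.write(content)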

# Directory for archived HTML pages, plus the JSON index of crawled URLs.
archived_html_dir = 'archived_html'
urls_crawled_filename = f"{archived_html_dir}/urls.json"


def get_html_content(url):
    """Fetch a page and return its raw HTML, using a desktop browser User-Agent."""
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.88 Safari/537.36'
    }
    r = requests.get(url, headers=headers)
    return r.text


def get_urls_crawled():
    """Load the index of already-crawled URLs, or return an empty list on first run."""
    if os.path.exists(urls_crawled_filename):
        return json.loads(read_file(urls_crawled_filename))
    return []


def save_urls_crawled(urls):
    """Persist the crawled-URL index as pretty-printed JSON, keeping non-ASCII text as-is."""
    write_file(json.dumps(urls, ensure_ascii=False, indent=4,
                          separators=(',', ':')), urls_crawled_filename)
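
# For reference, each entry in urls.json has the shape built by crawl() and
# crawl_url() below; the values here are illustrative placeholders only:
#
#     {
#         "url": "https://wsjkw.sh.gov.cn/yqtb/...",
#         "text": "<link text of the bulletin>",
#         "filename": "<md5 of the page HTML>.html"
#     }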


def crawl(pages, urls_crawled):
    """Crawl the bulletin index pages and archive every linked article not yet in urls_crawled.

    Returns index entries for the newly archived pages only; the caller merges
    them with urls_crawled and re-saves the index.
    """
    urls = []
    for p in pages:
        # '' is the first index page; '_1', '_2', ... are the follow-up pages.
        url = "https://wsjkw.sh.gov.cn/yqtb/" if p == '' else f"https://wsjkw.sh.gov.cn/yqtb/index{p}.html"
        html_content = get_html_content(url)
        soup = BeautifulSoup(html_content, 'html.parser')
        hyperlink_elements = soup.select('.list-date li')
        for hyperlink_element in hyperlink_elements:
            hyperlink_text = hyperlink_element.text
            hyperlink_url = hyperlink_element.a['href']
            target_url = hyperlink_url
            if not target_url.startswith("http"):
                # Resolve relative links against the site root.
                target_url = 'https://wsjkw.sh.gov.cn' + hyperlink_url
            # Skip pages that were already archived in a previous run.
            if target_url in set(map(lambda x: x['url'], urls_crawled)):
                continue
            hyperlink_html_content = get_html_content(target_url)
            # Name the archived file after the MD5 hash of its HTML content.
            hashname = hashlib.md5(
                hyperlink_html_content.encode('utf8')).hexdigest()
            filename = f"{hashname}.html"
            urls.append(
                {"url": target_url, "text": hyperlink_text, "filename": filename})
            write_file(hyperlink_html_content,
                       f"{archived_html_dir}/{filename}")
    return urls


def crawl_url(target_url, text):
    """Archive a single page (e.g. a WeChat article) and return its index entry."""
    urls = []
    hyperlink_html_content = get_html_content(target_url)
    hashname = hashlib.md5(
        hyperlink_html_content.encode('utf8')).hexdigest()
    filename = f"{hashname}.html"
    urls.append(
        {"url": target_url, "text": text, "filename": filename})
    write_file(hyperlink_html_content,
               f"{archived_html_dir}/{filename}")
    return urls


if __name__ == '__main__':
    # Batch mode: crawl the bulletin index pages and merge the results into the index.
    # pages = ['', "_1", "_2", "_3", "_4", "_5", "_6"]
    # pages = ['']
    # urls_crawled = get_urls_crawled()
    # urls = crawl(pages, urls_crawled)
    # urls.extend(urls_crawled)
    # save_urls_crawled(urls)

    # Single-page mode: archive one article by URL.
    url = "https://mp.weixin.qq.com/s/Zpll7k6wZfJiPNeV8sz6Ig"
    # Title kept verbatim; it reads "May 24 (0:00-24:00): residence information of
    # confirmed cases and asymptomatic infections in each district of the city".
    text = "5月24日(0-24时)本市各区确诊病例、无症状感染者居住地信息"
    urls_crawled = get_urls_crawled()
    urls = crawl_url(url, text)
    urls.extend(urls_crawled)
    save_urls_crawled(urls)