-
Notifications
You must be signed in to change notification settings - Fork 0
/
extract_email.py
60 lines (52 loc) · 1.48 KB
/
extract_email.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
import re
import html.parser as h
import requests
from sys import argv
def page_content(url="", timeout=30):
    """Download *url* and return the response body as text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot block the script forever.

    Returns:
        The decoded response body, or "" when the request fails (a notice
        is printed in that case).
    """
    try:
        return requests.get(url, timeout=timeout).text
    except requests.RequestException:
        # Covers malformed URLs, connection errors and timeouts, while
        # letting KeyboardInterrupt / SystemExit propagate.
        print("\n--< wrong url address\n")
        return ""
def find_emails(rC=""):
    """Return the unique email-like strings found in *rC*.

    Args:
        rC: The "replaced content", i.e. page text that has already been
            converted. Anything that is not a ``str`` yields an empty list.

    Returns:
        A list of distinct matches of ``[\\w.-]+@[\\w.-]+`` in the order
        in which they first appear in *rC*.
    """
    if not isinstance(rC, str):
        return []
    emails = re.findall(r'[\w\.-]+@[\w\.-]+', rC)
    # dict.fromkeys drops duplicates but keeps first-occurrence order, so
    # the result is the same on every run (a set would not guarantee that).
    return list(dict.fromkeys(emails))
def find_html_sign(htmlStr=""):
    """Collect the distinct decimal character references in *htmlStr*.

    A reference is anything of the form ``&#<digits>;``. Duplicates are
    removed; the order of the returned list is unspecified. Empty input
    gives an empty list.
    """
    if htmlStr:
        matches = re.findall(r'&#\d+;', htmlStr)
        return list(set(matches))
    return []
def html_to_ascii(htmlStr=""):
    """Decode the HTML character references in *htmlStr*.

    For example ``"&#64;"`` becomes ``"@"``. Empty input returns "".
    """
    # Use the documented html.unescape; html.parser only exposes the same
    # function as an internal import and does not list it in __all__.
    from html import unescape

    if not htmlStr:
        return ""
    return unescape(htmlStr)
def convert_page(C=""):
    """Normalise page text so that obfuscated addresses become matchable.

    Each decimal character reference found by ``find_html_sign`` is replaced
    with the character ``html_to_ascii`` decodes it to, then every single
    quote, space and plus sign is removed. Empty input returns "".
    """
    if not C:
        return ""

    # Swap each distinct reference for the character it encodes.
    for entity in find_html_sign(C):
        decoded = html_to_ascii(entity)
        C = C.replace(entity, decoded)

    # This is the most fragile part: stripping these three characters also
    # glues neighbouring words together.
    strip_table = str.maketrans("", "", "' +")
    return C.translate(strip_table)
def main(argv):
    """Fetch a page and return the email addresses found in it.

    Args:
        argv: Either the URL itself as a string, or a list/tuple whose
            first item is the URL. Any other value (or an empty sequence)
            is treated as an empty URL. Note that this parameter shadows
            the ``argv`` imported from ``sys`` inside the function.

    Returns:
        The list produced by ``find_emails`` for the converted page, or []
        when no content could be downloaded.
    """
    # Check the type before touching the value, so unsized input such as
    # None falls through to the empty URL instead of raising on len().
    if isinstance(argv, str):
        url = argv
    elif isinstance(argv, (list, tuple)) and argv:
        url = argv[0]
    else:
        url = ""

    C = page_content(url)  # full page content
    if not C:
        return []
    rC = convert_page(C)  # entities decoded, separators stripped
    return find_emails(rC)
if __name__ == "__main__":
    # Script entry point: everything after the program name is passed to
    # main(), and the resulting list is printed.
    emails = main(argv[1:])
    print(emails)