from bs4 import BeautifulSoup
import requests
import re
import xml.etree.ElementTree as ET
import pandas as pd

# Sample use case: Apple 10-Q filings.
def get_10Q(CIK, filing_type='10-q', records=10):
    # Example search URL:
    # 'https://www.sec.gov/cgi-bin/browse-edgar?action=getcompany&CIK=0000320193&type=10-q&dateb=&owner=exclude&count=40'
    search_url = ('https://www.sec.gov/cgi-bin/browse-edgar?action=getcompany&CIK='
                  + CIK + '&type=' + filing_type + '&dateb=&owner=exclude&count=40')
    # SEC EDGAR rejects requests without a declared User-Agent; the value
    # below is a placeholder that should be replaced with real contact info.
    headers = {'User-Agent': 'simple_SEC_scraper contact@example.com'}
    search_webpage = requests.get(search_url, headers=headers)
    search_html_data = search_webpage.text
    # This soup has links to document pages (tier 1).
    # The document pages have links to the data files (tier 2).
    soup = BeautifulSoup(search_html_data, "html.parser")
    attributes = soup.find_all('a')
    links = [a.get('href') for a in attributes]
    # Filing index pages live under /Archives/; skip <a> tags with no href.
    tier1_links = [l for l in links if l and 'Archive' in l]
    # Within each tier-1 document page, find the XBRL instance (.xml) document.
    base = 'https://www.sec.gov'
    files_created = []
    for doc_link in tier1_links[:records]:
        document_page = base + doc_link
        webpage = requests.get(document_page, headers=headers)
        html_data = webpage.text
        soup2 = BeautifulSoup(html_data, "html.parser")
        # Collect hrefs again on the document page.
        a_tagged_html = soup2.find_all('a')
        hrefs = [a.get('href') for a in a_tagged_html]
        # Filter for links ending in a digit followed by '.xml',
        # e.g. 'aapl-20151226.xml'.
        xml_links = [h for h in hrefs if h and re.search(r"[0-9]\.xml$", h)]
        # Expect a single match; otherwise print the candidates for inspection.
        if len(xml_links) == 1:
            filename = xml_links[0].split('/')[-1]
            final_xml = base + xml_links[0]
            data_doc = requests.get(final_xml, headers=headers)
            with open(filename, 'w') as f:
                f.write(data_doc.text)
            files_created.append(filename)
        else:
            print(xml_links)
    return files_created
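
# Hedged alternative sketch (an addition, not part of the original script):
# EDGAR also publishes a JSON submissions feed at
# https://data.sec.gov/submissions/CIK##########.json, which avoids HTML
# scraping entirely. The field names used below ('filings', 'recent',
# 'form', 'accessionNumber') are assumptions based on the published API
# and may change; the User-Agent value is a placeholder.
def list_recent_filings(CIK, filing_type='10-Q'):
    url = 'https://data.sec.gov/submissions/CIK' + CIK.zfill(10) + '.json'
    resp = requests.get(url, headers={'User-Agent': 'simple_SEC_scraper contact@example.com'})
    data = resp.json()
    recent = data['filings']['recent']
    # Pair each form type with its accession number and keep the matches.
    return [acc for form, acc in zip(recent['form'], recent['accessionNumber'])
            if form == filing_type]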
def get_numbers_from_xml(xml_file):
    # Parse one XBRL instance document, e.g. 'aapl-20151226.xml'.
    tree = ET.parse(xml_file)
    root = tree.getroot()
    dicto = {}
    for child in root:
        # Tags look like '{namespace}LocalName'; keep only the local name.
        # split('}')[-1] also handles tags without a namespace prefix.
        item = child.tag.split('}')[-1]
        val = child.text
        if val is None:
            continue
        # Skip unit/context bookkeeping elements and long text blocks.
        if any(i in item for i in ['unit', 'context', 'TextBlock']):
            continue
        dicto[item] = val
    dicto['Source'] = xml_file
    return pd.Series(dicto)
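
# Hedged helper (an addition, not part of the original script): XBRL facts
# are read back as strings; this sketch coerces them to numbers so values
# can be compared across quarters. Non-numeric facts (dates, identifiers)
# become NaN via errors='coerce'.
def to_numeric_facts(series):
    # Drop the 'Source' filename label, then coerce the remaining facts.
    return pd.to_numeric(series.drop('Source'), errors='coerce')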
if __name__ == "__main__":
    CIK = '0000320193'  # AAPL
    # Uncomment to download fresh filings instead of using the cached list:
    # files_created = get_10Q(CIK)
    # print(files_created)
    files_created = ['aapl-20160326.xml', 'aapl-20151226.xml', 'aapl-20150627.xml',
                     'aapl-20150328.xml', 'aapl-20141227.xml', 'aapl-20140628.xml',
                     'aapl-20140329.xml', 'aapl-20131228.xml', 'aapl-20130629.xml',
                     'aapl-20130330.xml']
    paired_results = pd.DataFrame([get_numbers_from_xml(filename) for filename in files_created])
    print(paired_results)
    paired_results.to_csv('Apple_consolidated.csv')
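    # Optional follow-up sketch using the hedged to_numeric_facts helper
    # above (an addition, not original code); uncomment to try:
    # numeric_results = paired_results.apply(to_numeric_facts, axis=1)
    # print(numeric_results.describe())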