-
Notifications
You must be signed in to change notification settings - Fork 0
/
extract_text.py
28 lines (26 loc) · 884 Bytes
/
extract_text.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
# encoding=utf8
import PyPDF2
import os
# filename = 'Frankenstein.pdf'
def text_ext(filename):
    """
    Extract the text of every page of a PDF and save it as a UTF-8 text file.

    The output file is written next to the input, using the input path with
    its extension replaced by ``.txt``.

    :param filename: path of the PDF file to read
    :return: None
    """
    # Open the PDF in a context manager so the handle is released even if
    # parsing fails; every page is read while the file is still open.
    with open(filename, 'rb') as pdf_file:
        pdf_reader = PyPDF2.PdfFileReader(pdf_file)
        # Join the page texts once instead of growing a string page by page.
        dump = ''.join(
            pdf_reader.getPage(i).extractText()
            for i in range(pdf_reader.numPages)
        )
    # splitext replaces the actual extension rather than assuming it is
    # exactly four characters long.
    txt_name = os.path.splitext(filename)[0] + '.txt'
    # Binary mode plus an explicit encode writes the bytes exactly as encoded.
    with open(txt_name, 'wb') as fo:
        fo.write(dump.encode('utf-8'))
# Visit the current working directory and everything below it, converting
# each PDF that is found into a text file.
for dirname, dirs, files in os.walk('.'):
    for filename in files:
        # Only names ending in a lowercase '.pdf' are converted.
        if not filename.endswith('.pdf'):
            continue
        thefile = os.path.join(dirname, filename)
        text_ext(thefile)
        print("Finished processing {}".format(thefile))