-
Notifications
You must be signed in to change notification settings - Fork 0
/
transform_from_json_data_to_yaml_manifest.py
85 lines (70 loc) · 3.05 KB
/
transform_from_json_data_to_yaml_manifest.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
# Half-finished development. Abandoned in favour of improving the solutions based on pdfminer.six or pdftotext.
# It produces a reasonably good index (index), but not the rest of the content (index2).
# It takes as input the output of extract_text_....
# The difficulty is that the texts appear fragmented in the structuredData.
import json
import sys
import yaml
import os.path
import uuid
# Absolute path of the directory that holds this script.
_script_path = os.path.abspath(__file__)
base_path = os.path.dirname(_script_path)
def letters_only(string: str) -> str:
    """Return *string* reduced to its letters and whitespace.

    Every character that is neither alphabetic (``str.isalpha``) nor
    whitespace (``str.isspace``) is dropped. Whitespace that would end up
    at the start of the result is dropped as well; interior and trailing
    whitespace is kept unchanged.

    Args:
        string: Text to filter.

    Returns:
        The filtered text, or ``""`` when *string* contains no letters.
    """
    # Filter in one pass and join once instead of growing a string
    # character by character.
    kept = "".join(ch for ch in string if ch.isalpha() or ch.isspace())
    # Dropping whitespace seen before the first letter is the same as
    # left-stripping the filtered text.
    return kept.lstrip()
def convert_json_to_yaml_manifest(file: str) -> None:
    """Build a YAML manifest from a JSON file of extracted document elements.

    Reads ``<base_path>/inputs/<file>``, walks its top-level ``elements``
    list and writes ``<base_path>/outputs/<stem>.yaml`` with these keys:

    * ``id``: a freshly generated UUID4 string.
    * ``title``: the file stem with ``_`` and ``-`` replaced by spaces,
      upper-cased.
    * ``index``: one ``{'chapter', 'title'}`` entry per element whose first
      path component is ``TOC`` and whose ``Text`` contains letters.
    * ``index2``: one entry per element whose first path component contains
      ``H1`` and whose ``Kids`` texts, concatenated, contain letters.

    Elements that lack the expected keys or hold values of an unexpected
    type are skipped (best effort).

    Args:
        file: Name of the JSON file, expected to be relative to the
            ``inputs`` directory next to this script.

    Raises:
        OSError: If the input cannot be read or the output cannot be written.
        json.JSONDecodeError: If the input is not valid JSON.
        KeyError: If the JSON document has no top-level ``elements`` key.
    """
    # splitext keeps dots inside the name ("report.v2.json" -> "report.v2").
    stem = os.path.splitext(file)[0]
    input_path = os.path.join(base_path, "inputs", file)
    output_path = os.path.join(base_path, "outputs", stem + '.yaml')

    # Parse the input before touching the output, so a missing or malformed
    # input does not leave an empty output file behind.
    with open(input_path, 'r', encoding="utf-8") as input_file:
        elements = json.load(input_file)['elements']

    manifest = {
        'id': str(uuid.uuid4()),
        'title': stem.replace("_", " ").replace("-", " ").upper(),
        'index': [],
        'index2': [],
    }

    # NOTE(review): a single counter numbers the entries of both lists, so
    # 'index2' chapters continue where 'index' left off -- confirm intended.
    chapter = 0
    for element in elements:
        try:
            root = element['Path'].replace("//Document/", "").split('/')[0]
            if root == 'TOC':
                # Index built from the table of contents.
                # Known limitation: hierarchy (children) is not identified.
                target = 'index'
                raw_text = element['Text']
                title = letters_only(raw_text) if raw_text else ""
            elif "H1" in root:
                # Index built from the content headings. The heading text
                # is split across the kids, so the letters-only fragments
                # are concatenated in order.
                target = 'index2'
                title = "".join(
                    letters_only(kid['Text']) for kid in element['Kids']
                )
            else:
                continue
        except (KeyError, TypeError, AttributeError):
            # Best effort: ignore elements without the expected shape.
            continue

        if title != "":
            chapter += 1
            manifest[target].append(
                {'chapter': chapter, 'title': title.strip().capitalize()}
            )

    with open(output_path, "w", encoding="utf-8") as output_file:
        yaml.dump(manifest, output_file, allow_unicode=True)
# Script entry point: convert the input file named by the first command-line
# argument. Guarded so that importing this module does not trigger a run.
if __name__ == "__main__":
    if len(sys.argv) < 2:
        # Exit with a readable message instead of an IndexError traceback.
        sys.exit("usage: " + sys.argv[0] + " <json file name inside inputs/>")
    filename = sys.argv[1]
    convert_json_to_yaml_manifest(filename)