-
Notifications
You must be signed in to change notification settings - Fork 0
/
tesseract.py
101 lines (82 loc) · 3.47 KB
/
tesseract.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
import pytesseract
from PIL import Image
import Levenshtein
import numpy as np
import pandas as pd
import os
from pathlib import Path
from tqdm import tqdm
def get_font_style(font_name):
    """
    Return a dictionary of booleans flagging style keywords found in font_name.

    Each key is a style keyword and its value tells whether that keyword
    occurs anywhere in the lower-cased font name.

    Matching is a plain substring test, so overlapping keywords are all
    reported together: a name containing "semibold" or "extrabold" also sets
    "bold", "extralight" also sets "light", and "semicondensed" or
    "extracondensed" also set "condensed".
    """
    lowered = font_name.lower()
    # Each keyword is listed exactly once; the order below is the key order
    # of the returned dictionary.
    style_keywords = (
        'regular',
        'mono',
        'black',
        'semibold',
        'bold',
        'extrabold',
        'italic',
        'light',
        'extralight',
        'thin',
        'medium',
        'semicondensed',
        'condensed',
        'extracondensed',
        'compact',
        'oblique',
    )
    return {keyword: keyword in lowered for keyword in style_keywords}
def count_png(path):
    """
    Count the PNG files found under path, including all subdirectories.

    A file counts when its name ends with ".png", compared
    case-insensitively.
    """
    return sum(
        1
        for _root, _subdirs, names in os.walk(path)
        for name in names
        if name.lower().endswith(".png")
    )
def create_evaluation_csv(experiment, csv_name='tesseract_experiment.csv'):
    """
    Write the experiment records to a CSV file.

    experiment is passed straight to pandas.DataFrame; the resulting frame
    is saved to csv_name without the index column.
    """
    pd.DataFrame(experiment).to_csv(csv_name, index=False)
def _parse_image_stem(stem):
    """Split an image file stem into (font_name, font_size, underlined, bold)."""
    # The rendering variant is encoded with the literal markers
    # "(underlined)" and "(bold)". Test for the markers themselves, not the
    # bare words, so a font merely named e.g. "...Semibold" is not
    # misreported as the bold variant.
    underlined = "(underlined)" in stem
    bold = "(bold)" in stem
    cleaned = stem.replace("(underlined)", "").replace("(bold)", "")
    parts = cleaned.split("_")
    # The last "_"-separated token is the font size; the remaining tokens
    # are concatenated (without the underscores) to form the font name.
    return ''.join(parts[:-1]), parts[-1], underlined, bold


def experiment(dataset_path=os.path.join('dataset', 'image')):
    """
    Run Tesseract over every PNG in the dataset and collect accuracy records.

    The dataset is expected to be laid out as
    ``dataset_path/<class directory>/<font>_<size>[markers].png``. Only the
    immediate sub-directories of dataset_path are scanned, and only entries
    that are PNG files are evaluated. The ground truth text is read from
    ``sample.txt`` in the current working directory.

    Returns a list of dictionaries with the keys 'font_name', 'accuracy'
    (formatted as a percentage string with two decimals), 'font_size',
    'vox_atypl' (the class directory name), 'underlined' and 'bold'.
    A missing dataset_path yields an empty list.
    """
    with open(os.path.join('sample.txt'), 'r') as file:
        ground_truth = file.read()

    # Take only the first os.walk() level: the class directories directly
    # below dataset_path. Deeper levels must not be joined onto dataset_path,
    # since such paths do not exist.
    _, class_dirs, _ = next(os.walk(dataset_path), (dataset_path, [], []))

    # Collect the work up front so the progress total matches exactly the
    # files that get processed. Sorting keeps the row order reproducible.
    jobs = []
    for class_name in sorted(class_dirs):
        class_path = os.path.join(dataset_path, class_name)
        for filename in sorted(os.listdir(class_path)):
            img_path = os.path.join(class_path, filename)
            if filename.lower().endswith(".png") and os.path.isfile(img_path):
                jobs.append((class_name, img_path))

    results = []
    progress_bar = tqdm(total=len(jobs), desc='Processing files', unit='file')
    try:
        for class_name, img_path in jobs:
            score = tesseract_eval(img_path, ground_truth)
            stem = Path(img_path).stem
            font_name, font_size, underlined, bold = _parse_image_stem(stem)
            results.append({
                'font_name': font_name,
                'accuracy': f"{score:.2f}%",
                'font_size': font_size,
                'vox_atypl': class_name,
                'underlined': underlined,
                'bold': bold,
            })
            progress_bar.update(1)
    finally:
        # Close the bar even when OCR fails part-way through.
        progress_bar.close()
    return results
def tesseract_eval(img_path, ground_truth):
    """
    OCR one image with Tesseract and score the result against ground_truth.

    The score is ``(len(ground_truth) - distance) / len(ground_truth) * 100``
    where distance is the Levenshtein distance between the extracted text
    and ground_truth. An exact match gives 100; the value can drop below 0
    when the distance exceeds the length of ground_truth.

    Raises ValueError if ground_truth is empty, because the score is
    undefined in that case (it would divide by zero).
    """
    # Reject an unusable reference before doing the expensive OCR call.
    if not ground_truth:
        raise ValueError("ground_truth must be a non-empty string")
    # The context manager releases the image file handle after OCR, so a run
    # over many images does not accumulate open files.
    with Image.open(img_path) as img:
        extracted_text = pytesseract.image_to_string(img)
    dist = Levenshtein.distance(extracted_text, ground_truth)
    return ((len(ground_truth) - dist) / len(ground_truth)) * 100
if __name__ == '__main__':
    # Store the records under their own name so the experiment() function
    # is not rebound to its return value.
    results = experiment()
    create_evaluation_csv(results)