-
Notifications
You must be signed in to change notification settings - Fork 0
/
build
executable file
·356 lines (293 loc) · 11.4 KB
/
build
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
#!/bin/bash
#
# Builds the dictionary database. Run `./build --help` for the option list.

# different modes of running the build
# A switch is "on" when it is non-empty; parse_args clears (or, for
# SLOVNIK_PASS and ARCHIVE, sets) these values from the command line.
PROCESS_SLOVNIK=true  # import the encrypted Slovnik data (needs a password)
EXIT_ON_FAILURE=true  # run the build with `set -e` and `pipefail`
CLEAN_TEMP_DATA=true  # delete intermediate files and helper tables
RUN_TESTS=true        # diff query output against the files in tests/
SET_TRAP=true         # install the EXIT trap that removes the scratch dir
COPY_FINAL=true       # clean previous results and copy the final database
SYLLABLE=true         # build the syllable table
SLOVNIK_PASS=         # Slovnik password supplied with --pass
ARCHIVE=              # pack the final database into a .tar.gz

# Ask bash to report terminated background jobs immediately.
set -b

#######################################
# Applies command-line options to the build-mode switches above.
# Globals:   PROCESS_SLOVNIK, RUN_TESTS, EXIT_ON_FAILURE, SYLLABLE,
#            CLEAN_TEMP_DATA, SET_TRAP, COPY_FINAL, SLOVNIK_PASS, ARCHIVE
#            (written)
# Arguments: the script's command-line arguments
# Outputs:   usage text on --help; an error on stderr for unknown options
# Returns:   0; exits 0 after --help and 1 on an unknown option
#######################################
parse_args() {
  local key
  while [[ $# -gt 0 ]]; do
    key="$1"
    case "$key" in
      -ns|--no-slovnik)
        # The test suites are switched off together with the Slovnik import.
        PROCESS_SLOVNIK=
        RUN_TESTS=
        ;;
      -nt|--no-tests)
        RUN_TESTS=
        ;;
      -nf|--no-fail)
        EXIT_ON_FAILURE=
        ;;
      -nsl|--no-syllable)
        SYLLABLE=
        ;;
      -c|--clean)
        make clean-code
        ;;
      -d|--debug)
        CLEAN_TEMP_DATA=
        ;;
      --no-trap)
        # This arm used to list "-nt" as well, but that spelling is always
        # consumed by --no-tests above, so the duplicate could never match.
        SET_TRAP=
        ;;
      -nc|--no-copy)
        COPY_FINAL=
        ;;
      -p|--pass)
        SLOVNIK_PASS="${2-}"
        # Consume the value too. If it is missing the password stays empty
        # and the interactive prompt is used later on.
        if [[ $# -gt 1 ]]; then
          shift
        fi
        ;;
      -a|--archive)
        ARCHIVE=true
        ;;
      --help)
        echo "./build [--help --no-slovnik --no-tests --no-fail --no-syllable --clean --debug --no-trap --no-copy --pass <password> --archive]"
        exit 0
        ;;
      *)
        >&2 echo "Unknown option" "$key"
        exit 1
        ;;
    esac
    shift
  done
}

parse_args "$@"
# Salt appended to the password before it is hashed.
SALT="%D*HUKFTV:"

#######################################
# Stores the salted, double SHA-512 hash of SLOVNIK_PASS in PASS_HASH.
# Globals: SLOVNIK_PASS, SALT (read); PASS_HASH (written)
#######################################
hash_pass() {
  # The expansion is quoted so the password and the "*" in the salt are hashed
  # literally (no word splitting, no filename expansion). The second
  # sha512sum deliberately hashes the first one's full output line.
  PASS_HASH=$(printf '%s' "${SLOVNIK_PASS}${SALT}" | sha512sum | sha512sum | awk '{print $1}')
}

#######################################
# Prompts for the Slovnik password without echoing it and hashes the answer.
# Globals: SLOVNIK_PASS, PASS_HASH (written)
# Outputs: the prompt on stdout; an error on stderr when nothing can be read
# Returns: exits 1 when stdin yields no input, so callers cannot loop forever
#######################################
pass() {
  echo Enter Slovnik password \(or rerun with option --no-slovnik\):
  # -r keeps backslashes in the password intact.
  if ! read -rs SLOVNIK_PASS && [[ -z "$SLOVNIK_PASS" ]]; then
    >&2 echo "Could not read the Slovnik password: no input available."
    exit 1
  fi
  hash_pass
}

if [[ -n "$PROCESS_SLOVNIK" ]]; then
  # Use the password given with --pass if there is one, otherwise ask for it.
  if [[ -z "$SLOVNIK_PASS" ]]; then
    pass
  else
    hash_pass
  fi
  # Keep asking until the hash matches the expected value.
  while [[ "$PASS_HASH" != "6d70366494981ca988a60c2e9f72fc157496896a602981c80ba828e1ac5030462fc4704bff5cb475accf8dc0a3efac703922c47c33e3ecf5c1163ad7ae89378f" ]]; do
    echo -e "Slovnik pass is wrong. Try again."
    pass
  done
fi
# Abort on the first failing command (or pipeline stage) unless --no-fail
# cleared EXIT_ON_FAILURE.
if [[ -n "$EXIT_ON_FAILURE" ]]; then
  set -e
  set -o pipefail
fi

# All intermediate files live in a private scratch directory. Stop right away
# if it cannot be created: with --no-fail the build would otherwise carry on
# with an empty SCRATCH_PATH and place its files directly under "/".
if ! SCRATCH_PATH=$(mktemp -d -t bdict.XXXXXXXXXX); then
  >&2 echo "Could not create a temporary build directory."
  exit 1
fi
RESOURCES_PATH=resources
PIPELINE_PATH=pipeline
SLOVNIK_PATH=$SCRATCH_PATH/slovnik
DB_RECHKO_PATH=$SCRATCH_PATH/rechko.db # this allows concurrent insertion
DB_MAIN_PATH=$SCRATCH_PATH/main.db # in the end, everything is merged here
DB_FINAL_PATH=dictionary.$(git describe).db # where the artifact gets deployed (relative to PWD)
SLOVNIK_CSV_PATH=$SCRATCH_PATH/slovnik.csv
#######################################
# EXIT handler: stops leftover background jobs, removes the scratch directory
# and exits with the status the script was already exiting with.
# Globals: SCRATCH_PATH (read)
# Outputs: progress messages on stdout
#######################################
finish() {
  # Capture the status first; every later command would overwrite $?.
  local exit_status=$?
  local job_pids
  # Cleanup is best-effort and must not abort half-way under `set -e`.
  set +e
  # Stop background jobs before deleting the directory they write into, and
  # only call kill when there is something to kill.
  echo Killing background jobs
  job_pids=$(jobs -p)
  if [[ -n "$job_pids" ]]; then
    # Intentionally unquoted: jobs -p prints one PID per line.
    kill -9 $job_pids 2>/dev/null
  fi
  rm -rf "$SCRATCH_PATH"
  echo Temporary files removed.
  exit "$exit_status"
}

# --no-trap leaves the scratch directory in place for inspection.
if [[ -n "$SET_TRAP" ]]; then
  trap finish EXIT
fi
# ---------------------------------------------------------------------------
# Parallel import stage.
#
# Two background jobs fill separate databases at the same time:
#   * the main job builds the helper binaries and imports Slovnik (optional),
#     Murdarov and RBE into $DB_MAIN_PATH;
#   * the Rechko job converts the Rechko schema and lemmata into
#     $DB_RECHKO_PATH.
# Their PIDs are recorded so that each one can be waited for individually. A
# bare `wait` returns 0 whatever happened to the jobs, which would hide their
# failures even when EXIT_ON_FAILURE is set.
# ---------------------------------------------------------------------------
(
  if [[ -n "$COPY_FINAL" ]]; then
    echo Cleaning previous builds
    make clean-result
  fi
  echo Building DB setup code
  make
  if [[ -n "$PROCESS_SLOVNIK" ]]; then
    ##### SLOVNIK SECTION START #####
    echo Decrypting and unpacking Slovnik
    openssl enc -aes-256-cbc -d -in "$RESOURCES_PATH/slovnik.tar.gz.enc" -pbkdf2 -pass "pass:$SLOVNIK_PASS" | tar -xz -C "$SCRATCH_PATH"
    # assembling slovnik into one csv file
    rm -f "$SLOVNIK_PATH/all.txt"
    for f in "$SLOVNIK_PATH"/*.txt; do
      # Each file is followed by a newline so that rows never run together.
      (cat "${f}"; printf "\n") >> "$SLOVNIK_CSV_PATH"
      rm -f "$f"
    done
    rm -rf "$SLOVNIK_PATH"
    echo Importing slovnik wordforms
    sqlite3 "$DB_MAIN_PATH" ".read pipeline/10_create_slovnik_wordform.sql"
    ./bin/import_slovnik "$DB_MAIN_PATH" "$SLOVNIK_CSV_PATH"
    rm -f "$SLOVNIK_CSV_PATH"
    sqlite3 "$DB_MAIN_PATH" ".read pipeline/11_fix_slovnik_errors.sql" ".read pipeline/12_update_pos_slovnik.sql"
    ##### SLOVNIK SECTION END #####
  fi
  ###### MURDAROV SECTION START #####
  echo Importing Murdarov lemmata
  sqlite3 "$DB_MAIN_PATH" "CREATE TABLE murdarov_lemma(lemma_stressed TEXT);" ".mode csv" ".import $RESOURCES_PATH/murdarov.txt murdarov_lemma" \
    ".read pipeline/13_import_murdarov_lemma.sql"
  ##### MURDAROV SECTION END #####
  ###### RBE SECTION START #####
  echo Unpacking RBE
  zcat "$RESOURCES_PATH/db-rbe.sql.gz" > "$SCRATCH_PATH/db-rbe.sql"
  echo Cleaning RBE
  # The sed scripts are matched by glob inside the command string, so they
  # are expanded by the shell that parallel starts for every block.
  parallel -a "$SCRATCH_PATH/db-rbe.sql" --block 30M --pipe-part \
    "sed -n -E -f $PIPELINE_PATH/20_*.sed |sed -E -f $PIPELINE_PATH/21_*.sed" \
    >"$SCRATCH_PATH/db-rbe.csv"
  rm -f "$SCRATCH_PATH/db-rbe.sql"
  echo Importing RBE lemmata into database
  sqlite3 "$DB_MAIN_PATH" ".read pipeline/30_create_rbe_lemma.sql" ".mode csv" ".separator ^" ".import $SCRATCH_PATH/db-rbe.csv rbe_lemma"
  sqlite3 "$DB_MAIN_PATH" "CREATE VIRTUAL TABLE rbe_lemma_ft USING fts5(source_definition, content=rbe_lemma);" \
    \ "INSERT INTO rbe_lemma_ft SELECT SUBSTR(source_definition, 1, MAX(INSTR(source_definition, '<b>1.'), 150)) FROM rbe_lemma;"
  sqlite3 "$DB_MAIN_PATH" ".read pipeline/31_add_pos_to_rbe_lemma.sql" ".load lib/libextfun" ".read pipeline/32_fix_rbe_lemma_errors.sql"
  sqlite3 "$DB_MAIN_PATH" "CREATE INDEX IF NOT EXISTS idx_rbe_lemma_lemma_pos ON rbe_lemma(lemma, pos);" \
    \ "CREATE INDEX IF NOT EXISTS idx_rbe_lemma_lemma_with_stress_pos ON rbe_lemma(lemma_with_stress, pos);"
  rm -f "$SCRATCH_PATH/db-rbe.csv"
  ##### RBE SECTION END #####
) &
MAIN_IMPORT_PID=$!

(
  ##### RECHKO SECTION START #####
  echo Unpacking Rechko
  zcat "$RESOURCES_PATH/db-rechko.sql.gz" > "$SCRATCH_PATH/db-rechko.sql"
  echo Converting Rechko MySQL schema dump to SQLite
  # first extract schema into a separate file
  ./tools/mysql2sqlite.awk <( head -n 450 "$SCRATCH_PATH/db-rechko.sql" ) > "$SCRATCH_PATH/db_schema_rechko_all.sqlite"
  sed -n -E -e '12,26 p' "$SCRATCH_PATH/db_schema_rechko_all.sqlite" > "$SCRATCH_PATH/db_schema_rechko_wordform.sqlite"
  # then words/lemmata and word types into a sqlite file, since their definitions are hard to convert to csv and they are small
  ./tools/mysql2sqlite.awk <( sed -n -E -e '1268,1301 p' -e '1370 p' "$SCRATCH_PATH/db-rechko.sql" ) > "$SCRATCH_PATH/rechko_lemma_and_types.sqlite"
  echo Importing Rechko schema, lemmata and types
  sqlite3 "$DB_RECHKO_PATH" ".read $SCRATCH_PATH/db_schema_rechko_all.sqlite" ".read $SCRATCH_PATH/rechko_lemma_and_types.sqlite"
  if [[ -n "$CLEAN_TEMP_DATA" ]]; then
    rm -f "$SCRATCH_PATH/db_schema_rechko_all.sqlite"
    rm -f "$SCRATCH_PATH/rechko_lemma_and_types.sqlite"
  fi
  echo Cleaning Rechko
  # useful tables from Rechko: derivative_form, word, word_type
  sqlite3 "$DB_RECHKO_PATH" "ALTER TABLE word RENAME TO rechko_lemma;" \
    \ \ "CREATE INDEX IF NOT EXISTS idx_rechko_lemma ON rechko_lemma(name);"
  ##### RECHKO SECTION END #####
) &
RECHKO_IMPORT_PID=$!

# The Rechko job is awaited first: its unpacked dump feeds the wordform
# extraction below, and its database is needed by the lemma join.
wait "$RECHKO_IMPORT_PID"

# then derivative forms into a csv
# The extraction is started from this shell, not from inside the Rechko
# subshell. A PID assigned inside a subshell never reaches the parent, and
# the parent can only wait for its own children; started here, the job keeps
# running during the joins below and is reliably awaited before its CSV is
# imported.
echo Extracting rechko derivative forms to csv
(
  sed -n -e "485,1132 p" "$SCRATCH_PATH/db-rechko.sql" |
    parallel --pipe "sed -n -E -f $PIPELINE_PATH/22_*.sed |sed -E -f $PIPELINE_PATH/23_*.sed" \
    >"$SCRATCH_PATH/rechko_wordform.csv"
) &
RECHKO_PARALLEL_PID=$!

# The main database has to be complete before anything is joined into it.
wait "$MAIN_IMPORT_PID"

echo Joining RBE lemmata with Rechko lemmata
sqlite3 "$DB_MAIN_PATH" "ATTACH '$DB_RECHKO_PATH' as rechko;" ".load lib/libextfun" ".read pipeline/33_join_rechko_lemma.sql" "DETACH rechko;"
if [[ -n "$CLEAN_TEMP_DATA" ]]; then
  rm -f "$DB_RECHKO_PATH"
fi
echo Creating lemma indices
sqlite3 "$DB_MAIN_PATH" "CREATE INDEX IF NOT EXISTS idx_lemma_lemma_pos ON lemma(lemma, pos);" \
  \ "CREATE INDEX IF NOT EXISTS idx_lemma_lemma_stressed_pos ON lemma(lemma_stressed, pos);" \
  \ "CREATE INDEX IF NOT EXISTS idx_lemma_num_syllables ON lemma(num_syllables);"
if [[ -n "$PROCESS_SLOVNIK" ]]; then
  echo Joining Slovnik lemmata with lemmata
  sqlite3 "$DB_MAIN_PATH" ".load lib/libextfun" ".read pipeline/34_join_slovnik_lemma.sql"
fi
echo Joining Murdarov lemmata with lemmata
sqlite3 "$DB_MAIN_PATH" ".load lib/libextfun" ".read pipeline/35_join_murdarov_lemma.sql"
echo NER processing in existing lemmata
sqlite3 "$DB_MAIN_PATH" ".read pipeline/38_ner_processing.sql"
echo Creating lemma fulltext index
sqlite3 "$DB_MAIN_PATH" "CREATE VIRTUAL TABLE lemma_ft USING fts5(definition, content=lemma);" "INSERT INTO lemma_ft SELECT definition FROM lemma;"

# The wordform CSV must be completely written before it is imported.
wait "$RECHKO_PARALLEL_PID"
# --- Wordform import --------------------------------------------------------
# Loads the Rechko derivative forms from the extracted CSV into the main
# database, builds the wordform indices and, when Slovnik is enabled, merges
# the Slovnik wordforms.
echo "Importing Rechko wordforms"
sqlite3 $DB_MAIN_PATH ".read $SCRATCH_PATH/db_schema_rechko_wordform.sqlite"
sqlite3 $DB_MAIN_PATH \
  ".mode csv" \
  ".separator ^" \
  ".import $SCRATCH_PATH/rechko_wordform.csv derivative_form"
sqlite3 $DB_MAIN_PATH \
  ".load lib/libextfun" \
  ".read pipeline/40_import_rechko_wordform.sql"

# One statement per argument, all executed by a single sqlite3 invocation.
wordform_index_sql=(
  "CREATE INDEX IF NOT EXISTS idx_wordform_wordform_tag ON wordform(wordform, tag);"
  " CREATE INDEX IF NOT EXISTS idx_wordform_lemma_id ON wordform(lemma_id);"
  " CREATE INDEX IF NOT EXISTS idx_wordform_tag ON wordform(tag);"
  " CREATE INDEX IF NOT EXISTS idx_wordform_wordform_stressed ON wordform(wordform_stressed);"
  " CREATE INDEX IF NOT EXISTS idx_rechko_wordform_wordform_tag ON rechko_wordform(wordform, tag);"
)
sqlite3 $DB_MAIN_PATH "${wordform_index_sql[@]}"

# The Rechko scratch files are not read again after this point.
if [[ -n $CLEAN_TEMP_DATA ]]; then
  rm -f $SCRATCH_PATH/db-rechko.sql
  rm -f $SCRATCH_PATH/rechko_wordform.csv
  rm -f $SCRATCH_PATH/db_schema_rechko_wordform.sqlite
fi

if [[ -n $PROCESS_SLOVNIK ]]; then
  echo "Joining Slovnik wordforms with Rechko wordforms"
  slovnik_index_sql=(
    "CREATE INDEX IF NOT EXISTS idx_slovnik_wordform_wordform_tag ON slovnik_wordform(wordform, tag);"
    " CREATE INDEX IF NOT EXISTS idx_slovnik_wordform_lemma_tag ON slovnik_wordform(lemma, tag);"
  )
  sqlite3 $DB_MAIN_PATH "${slovnik_index_sql[@]}"
  sqlite3 $DB_MAIN_PATH ".read pipeline/41_join_slovnik_wordforms.sql"
fi
# --- Post-processing, table tests and helper-table cleanup ------------------
echo "Cleaning lemma and wordforms table"
sqlite3 $DB_MAIN_PATH \
  ".load lib/libextfun" \
  ".read pipeline/45_postprocess_lemma_table.sql"

# Each test query's output is compared with its expected file; diff fails
# (and stops the build when EXIT_ON_FAILURE is set) on any difference.
if [[ -n $RUN_TESTS ]]; then
  echo "Run tests on lemmata tables"
  sqlite3 -list $DB_MAIN_PATH ".read pipeline/50_test_lemma.sql" \
    | diff tests/lemma.test -
  echo "Run tests on wordform table"
  sqlite3 -list $DB_MAIN_PATH ".read pipeline/51_test_wordform.sql" \
    | diff tests/wordform.test -
fi

if [[ -n $CLEAN_TEMP_DATA ]]; then
  # drop rechko_lemma and rbe_lemma tables
  echo "Deleting old lemma tables"
  stale_lemma_tables=(
    "DROP TABLE rbe_lemma;"
    "DROP TABLE rechko_lemma;"
    "DROP TABLE rbe_lemma_ft;"
    "DROP TABLE rechko_word_type;"
  )
  sqlite3 $DB_MAIN_PATH "${stale_lemma_tables[@]}"

  echo "Deleting rechko wordforms table"
  stale_wordform_tables=(
    "DROP TABLE IF EXISTS derivative_form;"
    "DROP TABLE IF EXISTS rechko_wordform;"
  )
  sqlite3 $DB_MAIN_PATH "${stale_wordform_tables[@]}"

  if [[ -n $PROCESS_SLOVNIK ]]; then
    echo "Deleting slovnik wordforms"
    sqlite3 $DB_MAIN_PATH "DROP TABLE IF EXISTS slovnik_wordform;"
  fi

  echo "Vacuum"
  sqlite3 $DB_MAIN_PATH "VACUUM"
fi
# --- Stress, accent model and pronunciation ---------------------------------

# Runs sqlite3 on the main database, loading lib/libextfun before the given
# dot-commands.
with_extfun() {
  sqlite3 $DB_MAIN_PATH ".load lib/libextfun" "$@"
}

echo "Fixing lemmata table"
sqlite3 $DB_MAIN_PATH ".read pipeline/61_fix_stress_errors.sql"

echo "Detecting morphological derivations"
with_extfun \
  ".read pipeline/62_add_stress_to_lemma.sql" \
  ".read pipeline/63_detect_derivations.sql"

echo "Populating lemma table with stresses"
with_extfun \
  ".read pipeline/64_add_stress_to_lemma.sql" \
  ".read pipeline/65_add_stress_to_lemma_manually.sql"

echo "Populating wordform table with stresses"
with_extfun \
  ".read pipeline/66_add_stress_to_wordform.sql" \
  ".read pipeline/67_add_stress_to_wordform_manually.sql"

echo "Generating accent model"
with_extfun ".read pipeline/69_add_accent_model.sql"

if [[ -n $RUN_TESTS ]]; then
  echo "Run tests on stresses"
  # sqlite3's stderr is discarded here; only its stdout is compared.
  sqlite3 -list $DB_MAIN_PATH ".load lib/libextfun" ".read pipeline/70_test_stress.sql" 2>/dev/null \
    | diff tests/stress.test -
fi

# Helper tables that the remaining steps in this script do not reference.
if [[ -n $CLEAN_TEMP_DATA ]]; then
  sqlite3 $DB_MAIN_PATH "DROP TABLE murdarov_lemma;"
  sqlite3 $DB_MAIN_PATH "DROP TABLE derivation;"
fi

echo "Create pronunciation table"
sqlite3 $DB_MAIN_PATH ".read pipeline/80_create_pronunciation.sql"

echo "Add pronunciations"
with_extfun ".read pipeline/81_add_pronunciation.sql"

if [[ -n $RUN_TESTS ]]; then
  echo "Run tests on pronunciations"
  # sqlite3's stderr is discarded here; only its stdout is compared.
  sqlite3 -list $DB_MAIN_PATH ".load lib/libextfun" ".read pipeline/85_test_pronunciation.sql" 2>/dev/null \
    | diff tests/pronunciation.test -
fi

echo "Vacuum"
sqlite3 $DB_MAIN_PATH "VACUUM"
# --- Syllables and final artifact -------------------------------------------
if [[ -n $SYLLABLE ]]; then
  echo "Create syllable table"
  sqlite3 $DB_MAIN_PATH ".read pipeline/90_create_syllable.sql"
  echo "Generating syllables"
  ./bin/generate_syllable $DB_MAIN_PATH
  echo "Vacuum"
  sqlite3 $DB_MAIN_PATH "VACUUM"
fi

# Copy the finished database out of the scratch directory.
if [[ -n $COPY_FINAL ]]; then
  echo "Persisting temp database"
  cp $DB_MAIN_PATH $DB_FINAL_PATH
fi

# The archive is named after the final database with ".db" replaced by
# ".tar.gz"; tar reads $DB_FINAL_PATH, i.e. the copy made above.
if [[ -n $ARCHIVE ]]; then
  echo "Archiving dictionary"
  archive_path=${DB_FINAL_PATH%%.db}.tar.gz
  tar -czvf $archive_path $DB_FINAL_PATH
fi

echo "Build finished. Artifact generated."