#!/usr/bin/env perl -w
#
# Tokenize.pm: Generic tokenizer.
# 2012-08-22ff: Written by Steven J. DeRose.
#
use strict;
use Getopt::Long;
use Encode;
use charnames ':full';
use Unicode::Normalize;
use Unicode::Normalize 'decompose';
#use Devel::DProf;
use sjdUtils;
our %metadata = (
'title' => "Tokenize",
'description' => "Generic tokenizer",
'rightsHolder' => "Steven J. DeRose",
'creator' => "http://viaf.org/viaf/50334488",
'type' => "http://purl.org/dc/dcmitype/Software",
'language' => "Perl 5",
'created' => "2012-08-22",
'modified' => "2021-09-16",
'publisher' => "http://github.com/sderose",
'license' => "https://creativecommons.org/licenses/by-sa/3.0/"
);
our $VERSION_DATE = $metadata{'modified'};
=pod
=head1 Usage
This is a natural-language tokenizer, intended as a front-end to NLP
software, particularly lexico-statistical calculators.
It can also be used to normalize text without tokenizing, or
as a preprocessor for more extensive NLP stacks.
It is particularly focused on handling a few complex issues well:
=over
=item * Less-common characters such as ligatures, accents,
odd kinds of digits, hyphens, quotes, and spaces, presentation variants,
angstrom vs. a-with-ring, etc.
=item * Many kinds of non-word tokens, such as URIs, Twitter conventions,
numbers, dates, times, email addresses, etc.
=item * Choice of how to divide edge cases such as contractions and possessives
(with and without explicit apostrophes), hyphenated words
(not the same thing as em-dash-separated clauses), etc.
=item * Options to filter out unwanted tokens, such as the non-word types
already mentioned, as well as words listed in a given dictionary, tokens
with given case patterns or special characters, long or short tokens, etc.
=back
=head2 Example
    use Tokenizer;
    my %counts = ();
    my $myTok = new Tokenizer("characters");
    $myTok->setOption("Uppercase_Letter", "lower");
    while (my $rec = <>) {
        my @tokens = @{$myTok->tokenize($rec)};
        for my $token (@tokens) {
            $counts{$token}++;
        }
    }
=head1 The process
There are several steps to the process of tokenizing, each controlled
by various options.
Option names appear in B<BOLD>, and values in I<ITALIC> below.
The type of value expected is shown in (parentheses): either (boolean), (int),
or (disp), unless otherwise described.
=head2 1: Expand escaped characters
These options all begin with "X_" and all take (boolean) values,
for whether to expand them to a literal character.
=over
=item * B<X_BACKSLASH> -- A lot of cases are covered.
=item * B<X_URI> -- %-escapes as used in URIs.
Not to be confused with the B<T_URI> option for tokenizing (see below).
=item * B<X_ENTITY> -- Covers HTML and XML named entities and
numeric character references (assuming the caller didn't already parse and
expand them).
=back
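For instance, a minimal sketch (methods as documented under Methods below;
the sample text is arbitrary):

    my $tkz = new Tokenizer("words");
    $tkz->setOption("X_BACKSLASH", 1);   # expand backslash escapes like \n, \x42
    $tkz->setOption("X_ENTITY", 1);      # expand &amp;, &#65;, etc.
    my @tokens = @{$tkz->tokenize("AT&amp;T")};
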
=head2 2: Normalize the character set
These options are distinguished by being named in Title_Case with underscores
(following the Perl convention for Unicode character class names).
See L<http://unicode.org/reports/tr44/tr44-4.html#General_Category_Values>.
This all assumes that the data is already Unicode, so be careful of CP1252.
=over
=item * B<Ascii_Only> (boolean) -- a special case.
Discards all non-ASCII characters, and turns control characters (such as
CR, LF, FF, VT, and TAB) to space. If you specify this, you should not specify
other character set normalization options.
=back
All other character set normalization options are of type (disp).
(disp) values that apply to any character category at all:
    "keep"      -- Don't change the characters
    "delete"    -- Delete the characters entirely
    "space"     -- Replace the characters with a space
    "unify"     -- Convert all matches to a single character (see below)
(disp) values only for Number and its subtypes:
    "value"     -- Replace with the numeric value
(disp) values only for Letter and its subtypes:
    "upper"     -- Force to upper-case
    "lower"     -- Force to lower-case
    "strip"     -- Decompose (NFKD) and then strip any diacritics
    "decompose" -- Decompose (NFKD) into component characters
I<Letter> and its subcategories default to C<keep>; all other
character categories default to C<unify> (see below for the
meaning of "unify" for each case).
B<Note>: A character may have multiple decompositions, or may be
undecomposable. The resulting string will also be in Compatibility decomposition
(see L<http://unicode.org/reports/tr15/>) and
Unicode's Canonical Ordering Behavior. Compatibility decomposition combines
stylistic variations such as font, breaking, cursive, circled, width,
rotation, superscript, squared, fractions, I<some> ligatures
(for example ff but not oe), and pairs like angstrom vs. A with ring,
ohm vs. omega, long s vs. s.
C<unify> changes each character of the given class
to one particular ASCII character to represent the class:
    Letter                  unifies to "A"
    Cased_Letter            unifies to "A"
    Uppercase_Letter        unifies to "A"
    Lowercase_Letter        unifies to "a"
    Titlecase_Letter        unifies to "Fi"
    Modifier_Letter         unifies to "A"
    Other_Letter            unifies to "A"
    Mark                    unifies to " "
    Nonspacing_Mark         unifies to " "
    Spacing_Mark            unifies to " "
    Enclosing_Mark          unifies to " "
    Number                  unifies to "9"
    Decimal_Number          unifies to "9"
    Letter_Number           unifies to "9"
    Other_Number            unifies to "9"
    Punctuation             unifies to "."
    Connector_Punctuation   unifies to "_"
    Dash_Punctuation        unifies to "-"
    Open_Punctuation        unifies to "("
    Close_Punctuation       unifies to ")"
    Initial_Punctuation     unifies to "`"
    Final_Punctuation       unifies to "'"
    Other_Punctuation       unifies to "*"
    Symbol                  unifies to "#"
    Math_Symbol             unifies to "="
    Currency_Symbol         unifies to "\$"
    Modifier_Symbol         unifies to "#"
    Other_Symbol            unifies to "#"
    Separator               unifies to " "
    Space_Separator         unifies to " "
    Line_Separator          unifies to " "
    Paragraph_Separator     unifies to " "
    Other                   unifies to "?"
    Control                 unifies to "?" (includes more than the 64
                            ASCII controls; for example, U+00A0)
    Format                  unifies to "?"
    Surrogate               unifies to "?"
    Private_Use             unifies to "?"
    Unassigned              unifies to "?"
C<unify> can also be used for the Non-word token options (see below); in that
case, each option has a particular value to which matching I<tokens> unify.
Setting the option for a cover category (such as I<Letter>)
is merely shorthand for
setting all its subcategories to that value (some or all subcategories can
still be reset afterward, but any I<earlier> setting for a subcategory
is discarded when you set its cover category).
To get a list of the category options run C<Tokenizer.pm -list>.
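For example, to unify all punctuation but keep dashes intact (a sketch
of the cover-category shorthand just described):

    $tkz->setOption("Punctuation", "unify");      # sets all punctuation subcategories
    $tkz->setOption("Dash_Punctuation", "keep");  # then re-set one subcategory
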
These additional character set normalization options can also be used
(but are not Unicode General Categories):
=over
=item * B<Accent> --
These are related to Unicode B<Nonspacing_Mark>,
but that also would include vowel marks, which this doesn't.
I<decompose> and I<strip> are important values for this option:
the former splits a composed letter+diacritic or similar combination
into its component parts; the latter discards the diacritic instead.
I<delete> discards the whole accent+letter combination (?).
B<Note>: There is a separate Unicode property called "Diacritic",
but it isn't available here yet.
=item * B<Control_0> -- The C0 control characters.
That is, the usual ones from \x00 to \x1F.
This option only matters if I<Control> is set to C<keep>.
=item * B<Control_1> -- The C1 control characters.
That is, the "upper half" ones from \x80 to \x9F.
B<Note>: These are graphical characters in the common Windows(r) character
set known as "CP1252", but not in Unicode or most other sets.
This option only matters if I<Control> is set to C<keep>.
=item * B<Digit> -- characters 0-9 -- Cf Unicode B<Number>, which is broader.
=item * B<Ligature> characters -- This also includes titlecase and digraph
characters. B<Note>: Some Unicode ligatures, particularly in Greek, may also
be involved in accent normalization.
See also L<http://en.wikipedia.org/wiki/Typographic_ligature>
B<(not yet supported)>
=item * B<Fullwidth> --
See L<http://en.wikipedia.org/wiki/Halfwidth_and_fullwidth_forms>
B<(not yet supported)>
=item * B<Math> -- variants of regular characters, such as script "R".
These are in the Unicode B<Math> general category.
B<(not yet supported)>
=item * B<Nbsp> -- The non-breaking space character, U+00A0. This
defaults to being changed to a regular space.
=item * B<Soft_Hyphen> -- The soft (optional) hyphen characters,
U+00AD and U+1806. These default to being deleted.
=back
=head2 3: Shorten runs of the same character
These options are both (int); 0 disables them.
=over
=item * B<N_CHAR> Reduce runs of >= N of the same
word-character in a row, to just N occurrences.
=item * B<N_SPACE> Reduce runs of >= N white-space characters
(not necessarily all the same) to just N.
=back
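For example (a sketch; N_CHAR and N_SPACE take the run length N as their value):

    $tkz->setOption("N_CHAR", 2);    # e.g. "Yayyyyy" -> "Yayy"
    $tkz->setOption("N_SPACE", 1);   # collapse runs of white-space to one
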
=head2 4: Non-word tokens
This step can tweak various kinds of non-word tokens, such as
numbers, URIs, etc. The options are of type (disp), but the
only meaningful settings are "keep", "delete", "space", and "unify".
=over
=item * B<T_TIME> tokens, such as "6:24 pm".
=item * B<T_DATE> tokens, such as "2012-08-22" or "2012 BCE".
Month names and abbreviations are not yet supported.
=item * B<T_FRACTION> (including Unicode fraction characters if they
were not already normalized).
=item * B<T_NUMBER> tokens, including signed or unsigned integers, reals,
and exponential notation (however, fractions are dealt with separately).
This does not include spelled-out numbers such as "five hundred".
(not yet supported)
=item * B<T_CURRENCY> tokens, consisting of a currency symbol and a number,
such as $1, $29.95, etc.
=item * B<T_EMOTICON> items
=item * B<T_HASHTAG> items as in Twitter (#ibm)
=item * B<T_USER> items as in Twitter (@john)
=item * B<T_EMAIL> addresses
=item * B<T_URI> items (see also the B<X_URI> unescaping option earlier)
=back
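For example, to reduce every URI and email address to a single
representative token, and discard emoticons entirely (a sketch):

    $tkz->setOption("T_URI", "unify");
    $tkz->setOption("T_EMAIL", "unify");
    $tkz->setOption("T_EMOTICON", "delete");
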
=head2 5: Split tokens
The text can be broken into C<words> at runs of white-space, into
individual C<characters>, or not at all (C<none>). The choice depends on the
I<TOKENTYPE> option.
Then leading and trailing punctuation are broken off.
This prevents leaving parentheses, commas, quotes, etc. attached to words.
However, the script is not smart (at least, yet) about special cases such as:
    $12  ~5.2  #1  +12
    5'6"  5!  5%
    U.S.  p.m.
    ).  ."  +/-
    (a)  501(c)(3)
    @user  #topic  ~a  AT&T
    e'tre  D'Avaux  let's  y'all  and/or
This needs some adjustments re. which punctuation is allowed on which
end. Harder problems include plural genitives: "The three I<dogs'> tails."
and abbreviations versus sentence-ends.
A few special cases are controlled by these ("S_") options, such as
re-mapping contractions and breaking up hyphenated words (by inserting
extra spaces).
=over
=item * B<S_CONTRACTION> can be set to C<unify> in order to
expand most English contractions. For example:
won't, ain't, we'll, we'd, we're, we'll, somebody'd,
y'all, let's, gonna, cannot.
Not very useful for non-English text, even like "dell'" or "c'est".
(see also POS/multitagTokens).
=item * B<S_HYPHENATED> break at hyphens, making the hyphen a separate
token. (Doesn't deal with soft hyphens or other B<Format> characters.)
=item * B<S_GENITIVE> break "'s" to a separate token. This does not actually
catch all genitives, even in English (and many "'s" cases in English
can be either genitives or contractions of "is").
B<(not yet supported)>
=back
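For example (a sketch; S_GENITIVE is not yet supported):

    $tkz->setOption("S_CONTRACTION", "unify");   # "won't" -> "will not", etc.
    $tkz->setOption("S_HYPHENATED", "unify");    # split at hyphens
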
=head2 6: Filter out unwanted tokens ('words' mode only)
These options are all (boolean) except for B<F_MINLENGTH> and B<F_MAXLENGTH>.
For Boolean filter options, the default is off, which means the tokens
are not discarded.
=over
=item * B<F_MINLENGTH> (int) -- Discard all tokens shorter than this.
=item * B<F_MAXLENGTH> (int) -- Discard all tokens longer than this.
=item * B<F_SPACE> (boolean) -- can be used to delete all white-space items.
=item * Filter by case and special-character pattern
Each of the following (disjoint) categories
can be controlled separately (see also I<--ignoreCase>, I<--Letter>, etc.):
=over
=item * B<F_UPPER> (boolean) -- remove words with only capital or caseless letters
=item * B<F_LOWER> (boolean) -- remove words with only lower case or caseless letters
=item * B<F_TITLE> (boolean) -- remove words with only an initial capital or titlecase
letter, followed by only lower case or caseless letters.
=item * B<F_MIXED> (boolean) -- remove words with at least two capital and/or
titlecase letters, along with any number of lower case or caseless letters.
=item * B<F_ALNUM> (boolean) -- remove words that contain both digits and
letters.
=item * B<F_PUNCT> (boolean) -- remove words that contain both punctuation and
letters. However, hyphens, apostrophes, and periods do not count.
=back
=item * Tokens in any specified B<F_DICT> list. B<F_MINLENGTH> I<4>
(see above) can serve as a passable substitute for a dictionary of
function words.
=back
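For example, to discard short tokens, all-caps tokens, and white-space
tokens (a sketch):

    $tkz->setOption("F_MINLENGTH", 4);   # discard tokens under 4 characters
    $tkz->setOption("F_UPPER", 1);       # discard all-capital/caseless tokens
    $tkz->setOption("F_SPACE", 1);       # discard white-space tokens
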
=head1 Methods
=over
=item * B<new>(tokenType)
Instantiate the tokenizer, and set it up for the I<tokenType> to be
either B<characters> or B<words>.
=item * B<addOptionsToGetoptLongArg(hashRef,prefix)>
Add the options for this package to I<hashRef>, in the form expected by
Getopt::Long. If I<prefix> is provided, add it to the beginning of each
option name (to avoid name conflicts). All the options for this package
are distinct even ignoring case, so callers may ignore or regard case
for options as desired.
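For example, a sketch of typical wiring (assuming Getopt::Long is loaded
in the caller):

    my %getoptHash = ();
    $tkz->addOptionsToGetoptLongArg(\%getoptHash, "tok_");
    GetOptions(%getoptHash) || die "Bad options.\n";
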
=item * B<setOption>(name,value)
Change the value of the named option.
Option names are case-sensitive (but see previous method).
B<Note>: Setting the option for a Unicode cover category
(such as B<Letter> rather than B<Uppercase_Letter>), is merely shorthand for
setting all its subcategories to that value
(subcategories can still be reset afterward).
=item * B<getOption>(name)
Return the present value of the named option.
Option names are case-sensitive.
=item * B<tokenize>(string)
Break I<string> into tokens according to the settings in effect, and return
a reference to an array of them. B<Note>: This method uses several other
internal methods; they can be invoked separately if desired, but are not
documented fully here. The methods are as shown below ($s is a string to
handle):
    $s = $tkz->expand($s);
    $s = $tkz->normalize($s);
    $s = $tkz->shorten($s);
    $s = $tkz->nonWordTokens($s);
    @tokens = @{$tkz->splitTokens($s)};
    @tokens = @{$tkz->filter(\@tokens)};
=back
=head1 A few examples
Can we expand A to 'A', B to 'B', < to '>'?
And unbackslash \n \r\\\" \x42 \u0043 \U------44?
And unpercent %42 and some UTF like %C8%82?
AT&T is a company, as is B&O, and other companies....
But then (I think), (a) is a label. So is [bracket] and {brace}.
http://bit.ly/840284028#xyz or email me at foo@att.com.
How do we deal with #twitter tags and @userids? Yay!!!!!!!!!
Emoticons like :) and :( and :P are a pain, even at sentence end :).
Contractions it's good to have but we cannot, 'til we're gonna add 'em.
But we don't get foreign words d'jour; c'est la vie.
12:45pm on June 15, 2012, i.e., 2012-06-15.
What happened in the 20's? aja the '20s or just 20s. As in, 2012 CE.
It's a 1-horse (one-horse) town--with one horse-- right?
25mm is 1", which is smaller than 5'4". MCMLX.
12 deg. C is far warmer than 12K, unless that's 1/3 to 1-1/2 of your RAM.
-3.145000 == 27.3% of 12,001, or -2.4. 3.14E+28; it costs $12.1M or $.99.
2x4 is a board; 4x4 can also be a car they might drive in M*A*S*H.
But C++ and C# are programming languages and A-1 is a steak-sauce.
He said, "I'm the Dr." "Dr. Who?" Just the Dr., from Galifrey.
=head1 Known Bugs and Limitations
=over
Significant issues:
    Breaks unary minus, $ (and other currency?) off from numbers
    Breaks % off end of numbers
    Doesn't break "that's"
    Breaks "'em", "'til", "C++"
    Fails to break '.' or ',' off some word-ends, or }s
    Doesn't split multiple final punctuation apart: ",
    Too generous about expanding contractions (e.g. "Tom's")
Other issues:
    Not all options are finished. For example:
    I<Ligature, Math, Fullwidth, S_GENITIVE,> etc.
    I<T_NUMBER> is disabled for the moment.
    Titlecase characters, etc.
    Some of this can be done in a pre-pass with:
        iconv -f utf8 -t ascii//TRANSLIT
    (disp) values upper, lower, and decompose do not restrict themselves
    to just a single category, but affect all if set for any.
    Can't distinguish single vs. double quotes while unifying variants.
    Can't break words in orthographies that lack spaces (such as many
    ideographic scripts).
    With testTokenizer defaults, turns B&O into 9/9&O and ... into \.\.\.;
    doesn't separate }. Default unifies URIs, emails, and some (?) numerics.
    Doesn't do @userid.
    Maybe move Unification out of Tokenizer?
=back
=head1 Related commands
C<vocab>, C<ngrams>, C<normalizeSpace>, C<SimplifyUnicode>,
C<volsunga>, C<findNERcandidates>,....
=head1 History
# 2012-08-22ff: Written by Steven J. DeRose, based on
# variations in tuples, vocab, volsunga, etc. Rationalize....
# 2012-08-29 sjd: Change option values from strings to numbers. Profile a bit.
# Fix some regexes.
# 2012-09-04f sjd: Fix and doc ucc unify values and inheritance. Use in 'vocab'.
# Factor out more regexes, and precompile for speed. Rest of currency.
# Provide API for doing normalizing but not tokenizing
# 2012-09-10 sjd: break at = : ... emdash regardless of T_HYPHEN.
# 2013-08-29: Comment out digit->9 change in normalize(). Improve regexes
# for percent, fraction, etc.
# 2014-04-10: Add temporal words. Improve abbreviation-detection (attach periods).
# Change numerics to default to 'keep', not 'unify'.
# 2021-09-16: Cleanup.
#
# More cases:
# box(es).
# Slash? 1/2, 1-1/2, b/c, and/or, he/she, 3/day,...
# Abbreviation periods? Genitives?
# Single/double quotes?
# US$ CA$ U.S. loses final period, etc.
# Personal names with middle initial with period not caught
# Domain names (cf $tld)
# Hex numbers
# Sentence-leading words pulled into NER too readily.
# DNA sequences: [-ACGT]{10,}
=head1 To do (see also "Known Bugs and Limitations")
# Profile
# Can we switch to UCS2 instead of UTF8 for speed?
# Takes \d+ to 9999 too early, kills dates and numeric char refs.
# Way to return tokens with accompanying token-types
# Break at comma, --; trailing punc? trailing balancing punc?
# Option to control digit->9 change in normalize(). And move later?
# Option to ignore text in () etc.
# Leaves " on start/end of tokens -- "i )"
# U+00AD is considered \p{Control}!!!! (soft hyphen)
# Ignore case on URI matching, email,....
# Tokens with any non-ASCII chars
# Drop F_SPACE?
# Unicode Word_Break and Sentence_Break properties
# Ditch "filtering" options?
=head1 Rights
Copyright 2012-08-22 by Steven J. DeRose. This work is licensed under a
Creative Commons Attribution-Share Alike 3.0 Unported License.
For further information on this license, see
L<https://creativecommons.org/licenses/by-sa/3.0>.
For the most recent version, see L<http://www.derose.net/steve/utilities> or
L<https://github.com/sderose>.
=cut
###############################################################################
#
#package LanguageSpecific;
# Cf 'vocab' script
our $titles = "Mr|Dr|Mrs|Ms|Messr|Messrs|Rev|Fr|St|Pres|Gen|Cpl";
our $months = "January|February|March|April|May|June|" .
"July|August|September|October|November|December|" .
"Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sept?|Oct|Nov|Dec|";
our $weekdays = "Monday|Tuesday|Wednesday|Thursday|Friday|Saturday|Sunday|" .
"Mon|Tues?|Weds?|Thurs?|Fri|Sat|Sun|";
our $relative = "today|tomorrow|yesterday|eve";
our $daypart = "morning|noon|afternoon|night|midnight|dawn|dusk|matins|vespers|lauds";
our $eras = "BC|AD|BCE|CE";
our $zones = "EST|EDT|CST|CDT|MST|MDT|PST|PDT|Z";
###############################################################################
#
package Tokenizer;
# Reserved set of option values, mainly for how to map char classes.
# Use numbers for faster tests in map().
#
my %dispTypes = (
# Keyword Value Classes that can use it
# No change needed:
"keep" => 1, # "*"
# Done via map():
"unify" => 11, # "*"
"delete" => 12, # "*"
"space" => 13, # "*"
"strip" => 14, # "Letter"
"value" => 15, # "Number"
# Not done via map()... probably should be....
"upper" => 6, # "Letter"
"lower" => 7, # "Letter"
"decompose" => 8, # "Letter"
);
# Cf Datatypes.pm
my %knownTypes = (
'boolean' => '0|1',
'int' => '[-+]?\d+',
'float' => '[-+]?\d+(\.\d+)?', # fraction part optional
'string' => '.*',
'disp' => '(' . join("|",keys(%dispTypes)) . ')',
);
# See http://perldoc.perl.org/perlunicode.html#Unicode-Character-Properties
# http://unicode.org/reports/tr44/tr44-4.html#General_Category_Values
# (which has 30 entries, omitting the single-character meta ones)
#
# Changes here should be reflected in the Perldoc!
#
my $u = "";
my %ucc = (
# Unicode category name Unify Abbrev NumberOfChars
# LETTERS
$u . "Letter" => "A", # "L",
$u . "Cased_Letter" => "A", # "LC",
$u . "Uppercase_Letter" => "A", # "Lu", 01441
$u . "Lowercase_Letter" => "a", # "Ll", 01751
$u . "Titlecase_Letter" => "Fi", # "Lt", 00031
$u . "Modifier_Letter" => "A", # "Lm", 00037
$u . "Other_Letter" => "A", # "Lo", 11788
# MARKS
$u . "Mark" => " ", # "M",
$u . "Nonspacing_Mark" => " ", # "Mn", 01280
$u . "Spacing_Mark" => " ", # "Mc", 00353
$u . "Enclosing_Mark" => " ", # "Me", 00012
# NUMBERS
$u . "Number" => "9", # "N",
$u . "Decimal_Number" => "9", # "Nd", 00460
$u . "Letter_Number" => "9", # "Nl", 00224
$u . "Other_Number" => "9", # "No", 00464
# PUNCTUATION
$u . "Punctuation" => ".", # "P",
$u . "Connector_Punctuation" => "_", # "Pc", 00010 _ etc.
$u . "Dash_Punctuation" => "-", # "Pd", 00023 Not incl. soft hyphens
$u . "Open_Punctuation" => "(", # "Ps", 00072 Parentheses, etc.
$u . "Close_Punctuation" => ")", # "Pe", 00071
$u . "Initial_Punctuation" => "`", # "Pi", 00012 Sided quotes, etc.
$u . "Final_Punctuation" => "'", # "Pf", 00012
$u . "Other_Punctuation" => "*", # "Po", 00434 !"#%&'*,./:;?@\\ etc.
# SYMBOLS
$u . "Symbol" => "#", # "S",
$u . "Math_Symbol" => "=", # "Sm", 00952
$u . "Currency_Symbol" => "\$", # "Sc", 00049
$u . "Modifier_Symbol" => "#", # "Sk", 00115
$u . "Other_Symbol" => "#", # "So", 04404
# SEPARATORS
$u . "Separator" => " ", # "Z",
$u . "Space_Separator" => " ", # "Zs", 00018
$u . "Line_Separator" => " ", # "Zl", 00001
$u . "Paragraph_Separator" => " ", # "Zp", 00001
# OTHER CATEGORIES
$u . "Other" => "?", # "C",
$u . "Control" => "?", # "Cc", 00065
$u . "Format" => "?", # "Cf", 00139 shy, invis, joiner,
$u . "Surrogate" => "?", # "Cs", 00006
$u . "Private_Use" => "?", # "Co", 00006
$u . "Unassigned" => "?", # "Cn",
# BIDI PROPERTIES -- (not yet supported)
# Issue: Colon vs. underscore (Getopt doesn't like colons)
#
# "BiDi_L" => "A", # "L",
# "BiDi_LRE" => "A", # "LRE",
# "BiDi_LRO" => "A", # "LRO",
# "BiDi_R" => "A", # "R",
# "BiDi_AL" => "A", # "AL",
# "BiDi_RLE" => "A", # "RLE",
# "BiDi_RLO" => "A", # "RLO",
# "BiDi_PDF" => "A", # "PDF",
# "BiDi_EN" => "A", # "EN",
# "BiDi_ES" => "A", # "ES",
# "BiDi_ET" => "A", # "ET",
# "BiDi_AN" => "A", # "AN",
# "BiDi_CS" => "A", # "CS",
# "BiDi_NSM" => "A", # "NSM",
# "BiDi_BN" => "A", # "BN",
# "BiDi_B" => "A", # "B",
# "BiDi_S" => "A", # "S",
# "BiDi_WS" => "A", # "WS",
# "BiDi_ON" => "A", # "ON",
);
###############################################################################
#
sub new {
my ($class, $bt) = @_;
if (!$bt) { $bt = "words"; }
my $self = {
version => '2012-08-24',
options => {},
optionTypes => {},
optionHelps => {},
anyFilters => 1,
su => undef,
srcData => "",
tokens => [],
nNilTokens => [], # by place in record
regexes => {},
};
bless $self, $class;
if (!sjdUtils::getUtilsOption("verboseSet")) {
sjdUtils::setVerbose(0);
}
$self->defineOptions();
$self->setOption("TOKENTYPE", $bt);
$self->preCompileRegexes();
#$self->{su} = new SimplifyUnicode();
return($self);
} # new
sub defineOptions { # Move to XSV, and pull in regexes and unify targets
my ($self) = @_;
# Name Datatype Default Help
$self->defineOption('TVERBOSE', 'boolean', 0, '');
$self->defineOption('TOKENTYPE', 'string', 'words', '');
# 1: Expand
$self->defineOption('X_BACKSLASH', 'boolean', 0, '');
$self->defineOption('X_URI', 'boolean', 0, '');
$self->defineOption('X_ENTITY', 'boolean', 0, '');
# 2: Normalize
$self->defineOption('Ascii_Only', 'boolean', 0, '');
$self->defineOption('Accent', 'disp', 'keep', '');
$self->defineOption('Control_0', 'disp', 'keep', '');
$self->defineOption('Control_1', 'disp', 'keep', '');
$self->defineOption('Digit', 'disp', 'keep', '');
$self->defineOption('Fullwidth', 'disp', 'keep', '');
$self->defineOption('Ligature', 'disp', 'keep', '');
$self->defineOption('Math', 'disp', 'keep', '');
$self->defineOption('Nbsp', 'disp', 'space', '');
$self->defineOption('Soft_Hyphen', 'disp', 'delete', '');
for my $u (keys(%ucc)) { # The Unicode categories
$self->defineOption($u, 'disp', 'keep', '');
}
# 3: Shorten
$self->defineOption('N_CHAR', 'int', 0, '');
$self->defineOption('N_SPACE', 'int', 0, '');
# 4: Non-word tokens
$self->defineOption('T_TIME', 'disp', 'keep','');
$self->defineOption('T_DATE', 'disp', 'keep','');
$self->defineOption('T_FRACTION', 'disp', 'keep','');
$self->defineOption('T_NUMBER', 'disp', 'keep','');
$self->defineOption('T_CURRENCY', 'disp', 'keep','');
$self->defineOption('T_PERCENT', 'disp', 'keep','');
$self->defineOption('T_EMOTICON', 'disp', 'keep','');
$self->defineOption('T_HASHTAG', 'disp', 'keep','');
$self->defineOption('T_USER', 'disp', 'keep','');
$self->defineOption('T_EMAIL', 'disp', 'keep','');
$self->defineOption('T_URI', 'disp', 'keep','');
# 5: Special issues
$self->defineOption('S_CONTRACTION', 'disp', 'keep', '');
$self->defineOption('S_HYPHENATED', 'disp', 'keep', '');
$self->defineOption('S_GENITIVE', 'disp', 'keep', '');
# 6: Filter (0 means keep, 1 means filter out)
$self->defineOption('F_MINLENGTH', 'int', 0, '');
$self->defineOption('F_MAXLENGTH', 'int', 0, '');
$self->defineOption('F_DICT', 'string', '', '');
$self->defineOption('F_SPACE', 'boolean', 0, ''); # OBS?
$self->defineOption('F_UPPER', 'boolean', 0, '');
$self->defineOption('F_LOWER', 'boolean', 0, '');
$self->defineOption('F_TITLE', 'boolean', 0, '');
$self->defineOption('F_MIXED', 'boolean', 0, '');
$self->defineOption('F_ALNUM', 'boolean', 0, '');
$self->defineOption('F_PUNCT', 'boolean', 0, '');
} # defineOptions
sub defineOption {
my ($self, $name, $type, $default, $help) = @_;
alogging::vMsg(
2, sprintf("Defining option: %-24s (%8s) = '%s'",
$name, $type, $default));
(!defined $self->{options}->{$name}) ||
die "Duplicate option def for '$name'\n";
($name =~ m/^\w+$/) ||
die "defineOption: Non-word char in name '$name'\n";
(defined $knownTypes{$type}) ||
die "Unknown option type '$type' for option '$name'. Known: " .
join(", ", sort(keys(%knownTypes))) . "\n";
($self->checkType($type,$default)) ||
die "Bad default '$default' for option '$name' of type '$type'\n";
$self->{optionTypes}->{$name} = $type;
$self->{options}->{$name} = $default;
$self->{optionHelps}->{$name} = $help;
} # defineOption
sub checkType {
my ($self, $type, $value) = @_;
(my $typeExpr = $knownTypes{$type}) ||
die "Tokenizer::checkType: Unknown type name '$type'\n";
return(($value =~ m/^$typeExpr$/) ? 1:0);
}
sub setOption {
my ($self, $name, $value) = @_;
if (!defined $self->{options}->{$name}) {
warn "Tokenizer::setOption: Unknown option name '$name'.\n";
return(undef);
}
if (!$self->checkType($self->{optionTypes}->{$name}, $value)) {
warn "Tokenizer::setOption: Bad value '$value' for option '$name'\n";
return(undef);
}
$self->{options}->{$name} = $value;
if (index($name, "F_") == 0) {
$self->setAnyFilters();
warn "setOptions for '$name' led to $self->{anyFilters}.\n";
}
elsif (index($name, '_') < 0) { # cover category: inherit (not during defineOptions()!)
for my $u (keys(%ucc)) {
next unless ($u =~ m/_\Q$name\E$/); # subcategory names end with "_$name"
$self->{options}->{$u} = $value;
}
}
return($value);
}
sub setAnyFilters {
my ($self) = @_;
for my $op (keys(%{$self->{options}})) {
if ($op =~ m/^F_/ && $self->{options}->{$op}) {
$self->{anyFilters} = 1;
return;
}
}
$self->{anyFilters} = 0;
}
sub getOption {
my ($self, $name) = @_;
if (!defined $self->{options}->{$name}) {
warn "Tokenizer::getOption: Unknown option name '$name'.\n";
return(undef);
}
return($self->{options}->{$name});
}
sub addOptionsToGetoptLongArg {
my ($self,
$getoptHash, # The hash to pass to GetOptions()
$prefix # String to put on front of option names
) = @_;
if (!defined $prefix) { $prefix = ""; }
#alogging::vMsg(1, "In addOptionsToGetoptLongArg()");
if (ref($getoptHash) ne "HASH") {
alogging::eMsg(0,"addOptionsToGetoptLongArg: not a HASH.");
}
my %mapOptType = ( "boolean"=>"!", "int"=>"=i", "float"=>"=f",
"string"=>"=s", "disp"=>"=s", "count"=>"+", );
for my $name (sort keys(%{$self->{options}})) {
if ($name !~ m/^\w+$/) {
alogging::eMsg(
0,"Tokenizer::addOptionsToGetoptLongArg: Bad name '$name'"
);
}
my $dt = $self->{optionTypes}->{$name};
my $suffix = $mapOptType{$dt};
if (!$suffix) {
alogging::eMsg(0,"addOptionsToGetoptLongArg: " .
"Unknown type '$dt' for option '$name'. " .
"Known: (" . join(", ", keys(%mapOptType)) . ").");
$suffix = "!";
}
if (defined $getoptHash->{"$prefix$name$suffix"}) {
alogging::eMsg(
0,"Tokenizer::Option '$prefix$name$suffix' already in hash.");
}
$getoptHash->{"$prefix$name$suffix"} =
sub { $self->setOption($name, $_[1]); };
}
} # addOptionsToGetoptLongArg
###############################################################################
# The real work.
#
sub tokenize {
my ($self, $s) = @_;
my $tokens = undef;
if ($self->{options}->{"TVERBOSE"}) {
$self->{srcData} = $s;
alogging::vMsg(0, "====Tokenize: ", $self->{srcData});
$self->expand();
alogging::vMsg(0, " Expanded: ", $self->{srcData});
$self->normalize();
alogging::vMsg(0, " Normalized: ", $self->{srcData});
$self->shorten();
alogging::vMsg(0, " Shortened: ", $self->{srcData});
$tokens = $self->splitTokens();
alogging::vMsg(0, " Broken: ", "(".join("|",@{$tokens}).")");
($self->{anyFilters}) && $self->filter($tokens);
alogging::vMsg(0, " Filtered: ", "(".join("|",@{$tokens}).")");
}
else {
$self->{srcData} = $s;
$self->expand();
$self->normalize();
$self->shorten();
$self->nonWordTokens();
$tokens = $self->splitTokens();
($self->{anyFilters}) && $self->filter($tokens);
}
#warn "tokenize: ", join("|", @{$tokens}) . "\n";
for (my $i=0; $i<scalar(@{$tokens}); $i++) {
next unless ($tokens->[$i] eq "");
#warn "Nil.\n";
$self->{nNilTokens}->[$i]++;
splice(@{$tokens}, $i, 1);
$i--; # re-check this index: splice shifted the next token into it
}
return($tokens);
} # tokenize
###############################################################################
# Since regexes get passed to map(), avoid recompiling every time.
# This alone saves >50% of runtime.
#
sub preCompileRegexes {
my ($self) = @_;
$self->{regexes}->{"Ascii_Only"} = qr/[^[:ascii:]]/;
for my $ugcName (sort(keys(%ucc))) {
$self->{regexes}->{$ugcName} = qr/\p{$ugcName}/;
} # for
$self->{regexes}->{"Accent"} = qr//; ### FIX ###
$self->{regexes}->{"Control_0"} = qr/[\x00-\x1F]/;
$self->{regexes}->{"Control_1"} = qr/[\x80-\x9F]/;
$self->{regexes}->{"Digit"} = qr/[0-9]/;
#$self->{regexes}->{"Fullwidth"} = qr//; ### FIX ###
#$self->{regexes}->{"Ligature"} = qr//; ### FIX ###
#$self->{regexes}->{"Math"} = qr//; ### FIX ###
###################################################### VERY SPECIAL CHARS
$self->{regexes}->{"Nbsp"} = qr/\xA0/;
$self->{regexes}->{"Soft_Hyphen"} = qr/\xAD\u1806/;
###################################################### DATE/TIME
# Doesn't deal with alphabetic times
#
my $yr = '\b[12]\d\d\d'; # Year
my $era = '(AD|BC|CE|BCE)'; # Which half of hx
my $zone = '\s?[ECMP][SD]T'; # Time zone
$self->{regexes}->{"T_TIME"} =
qr/\b[012]?\d:[0-5]\d(:[0-5]\d)?\s*(a\.?m\.?|p\.?m\.?)?($zone)?\b/;
# Also '60s 60's 60s
$self->{regexes}->{"T_DATE"} =
qr/\b($yr".'[-\/][0-3]?\d[-\/][0-3]?\d)|($yr ?$era)\b/;