converter/pos_tagger.py

"""
POS tagger using fine-tuned BERT
Most of the code in this module is borrowed from https://github.com/soutsios/pos-tagger-bert
"""
import os
from pathlib import Path

FILE_DIR = os.path.dirname(os.path.realpath(__file__))
STATIC_DIR = os.path.join(FILE_DIR, "static/converter")
STATIC_DIR = Path(STATIC_DIR)
# trained model weights for POS
MODEL_WEIGHTS_DIR = Path(f"{STATIC_DIR}/bert_model/bert_last_epoch.h5")


# Check if the file with the weights exists
if not os.path.exists(MODEL_WEIGHTS_DIR):
    # if not, only expose a tag_pos function that doesn't do anything
    print(
        f"No model weights found at {MODEL_WEIGHTS_DIR}. Converter will not do POS tagging."
    )

    def tag_pos(t):
        return None


else:
    # if the weigts are there, initialise the entire BERT model
    print(
        f"Model weights found at {MODEL_WEIGHTS_DIR}. Loading model for POS tagging..."
    )

    import keras
    import numpy as np

    from keras.layers import Layer
    from keras import backend as K

    import tensorflow as tf
    import tensorflow_hub as hub
    from bert.tokenization import FullTokenizer

    # from tqdm import tqdm_notebook

    class PaddingInputExample(object):
        """Fake example so the num input examples is a multiple of the batch size.
        When running eval/predict on the TPU, we need to pad the number of examples
        to be a multiple of the batch size, because the TPU requires a fixed batch
        size. The alternative is to drop the last batch, which is bad because it means
        the entire output data won't be generated.
        We use this class instead of `None` because treating `None` as padding
        battches could cause silent errors.
        """

    class InputExample(object):
        """A single training/test example for simple sequence classification."""

        def __init__(self, guid, text_a, text_b=None, label=None):
            """Constructs a InputExample.
            Args:
            guid: Unique id for the example.
            text_a: string. The untokenized text of the first sequence. For single
                sequence tasks, only this sequence must be specified.
            text_b: (Optional) string. The untokenized text of the second sequence.
                Only must be specified for sequence pair tasks.
            label: (Optional) string. The label of the example. This should be
                specified for train and dev examples, but not for test examples.
            """
            self.guid = guid
            self.text_a = text_a
            self.text_b = text_b
            self.label = label

    def create_tokenizer_from_hub_module(bert_path):
        """Get the vocab file and casing info from the Hub module."""
        bert_module = hub.Module(bert_path)
        tokenization_info = bert_module(signature="tokenization_info", as_dict=True)
        vocab_file, do_lower_case = sess.run(
            [
                tokenization_info["vocab_file"],
                tokenization_info["do_lower_case"],
            ]
        )
        return FullTokenizer(
            vocab_file=vocab_file, do_lower_case=do_lower_case
        )  # , spm_model_file=vocab_file

    def convert_single_example(tokenizer, example, max_seq_length=256):
        """Converts a single `InputExample` into a single `InputFeatures`."""

        if isinstance(example, PaddingInputExample):
            input_ids = [0] * max_seq_length
            input_mask = [0] * max_seq_length
            segment_ids = [0] * max_seq_length
            label_ids = [0] * max_seq_length
            return input_ids, input_mask, segment_ids, label_ids

        tokens_a = example.text_a
        if len(tokens_a) > max_seq_length - 2:
            tokens_a = tokens_a[0 : (max_seq_length - 2)]

        # Token map will be an int -> int mapping between the `orig_tokens` index and
        # the `bert_tokens` index.

        # bert_tokens == ["[CLS]", "john", "johan", "##son", "'", "s", "house", "[SEP]"]
        # orig_to_tok_map == [1, 2, 4, 6]
        orig_to_tok_map = []
        tokens = []
        segment_ids = []

        tokens.append("[CLS]")
        segment_ids.append(0)
        orig_to_tok_map.append(len(tokens) - 1)
        # print(len(tokens_a))
        for token in tokens_a:
            tokens.extend(tokenizer.tokenize(token))
            orig_to_tok_map.append(len(tokens) - 1)
            segment_ids.append(0)
        tokens.append("[SEP]")
        segment_ids.append(0)
        orig_to_tok_map.append(len(tokens) - 1)
        input_ids = tokenizer.convert_tokens_to_ids(
            [tokens[i] for i in orig_to_tok_map]
        )
        # print(len(orig_to_tok_map), len(tokens), len(input_ids), len(segment_ids)) #for debugging

        # The mask has 1 for real tokens and 0 for padding tokens. Only real
        # tokens are attended to.
        input_mask = [1] * len(input_ids)

        label_ids = []
        labels = example.label
        label_ids.append(0)
        label_ids.extend([tag2int[label] for label in labels])
        label_ids.append(0)
        # print(len(label_ids)) #for debugging
        # Zero-pad up to the sequence length.
        while len(input_ids) < max_seq_length:
            input_ids.append(0)
            input_mask.append(0)
            segment_ids.append(0)
            label_ids.append(0)

        assert len(input_ids) == max_seq_length
        assert len(input_mask) == max_seq_length
        assert len(segment_ids) == max_seq_length
        assert len(label_ids) == max_seq_length

        return input_ids, input_mask, segment_ids, label_ids

    def convert_examples_to_features(tokenizer, examples, max_seq_length=256):
        """Convert a set of `InputExample`s to a list of `InputFeatures`."""

        input_ids, input_masks, segment_ids, labels = [], [], [], []
        # tqdm_notebook(examples, desc="Converting examples to features"):
        for example in examples:
            input_id, input_mask, segment_id, label = convert_single_example(
                tokenizer, example, max_seq_length
            )
            input_ids.append(input_id)
            input_masks.append(input_mask)
            segment_ids.append(segment_id)
            labels.append(label)
        return (
            np.array(input_ids),
            np.array(input_masks),
            np.array(segment_ids),
            np.array(labels),
        )

    def convert_text_to_examples(texts, labels):
        """Create InputExamples"""
        InputExamples = []
        for text, label in zip(texts, labels):
            InputExamples.append(
                InputExample(guid=None, text_a=text, text_b=None, label=label)
            )
        return InputExamples

    class BertLayer(Layer):
        def __init__(
            self, output_representation="sequence_output", trainable=True, **kwargs
        ):
            self.bert = None
            super(BertLayer, self).__init__(**kwargs)

            self.trainable = trainable
            self.output_representation = output_representation

        def build(self, input_shape):
            # SetUp tensorflow Hub module
            self.bert = hub.Module(
                bert_path, trainable=self.trainable, name="{}_module".format(self.name)
            )

            # Assign module's trainable weights to model
            # Remove unused layers and set trainable parameters
            # s = ["/cls/", "/pooler/", 'layer_11', 'layer_10', 'layer_9', 'layer_8', 'layer_7', 'layer_6']
            s = ["/cls/", "/pooler/"]
            self.trainable_weights += [
                var
                for var in self.bert.variables[:]
                if not any(x in var.name for x in s)
            ]

            for var in self.bert.variables:
                if var not in self._trainable_weights:
                    self._non_trainable_weights.append(var)

            # See Trainable Variables
            # tf.logging.info("**** Trainable Variables ****")
            # for var in self.trainable_weights:
            #    init_string = ", *INIT_FROM_CKPT*"
            #    tf.logging.info("  name = %s, shape = %s%s", var.name, var.shape, init_string)

            print("Trainable weights:", len(self.trainable_weights))
            super(BertLayer, self).build(input_shape)

        def call(self, inputs, mask=None):
            inputs = [K.cast(x, dtype="int32") for x in inputs]
            input_ids, input_mask, segment_ids = inputs
            bert_inputs = dict(
                input_ids=input_ids, input_mask=input_mask, segment_ids=segment_ids
            )
            result = self.bert(inputs=bert_inputs, signature="tokens", as_dict=True)[
                self.output_representation
            ]
            return result

        def compute_mask(self, inputs, mask=None):
            return K.not_equal(inputs[0], 0.0)

        def compute_output_shape(self, input_shape):
            if self.output_representation == "pooled_output":
                return (None, 768)
            else:
                return (None, None, 768)

    # Build model
    def build_model(max_seq_length):
        seed = 0
        in_id = keras.layers.Input(shape=(max_seq_length,), name="input_ids")
        in_mask = keras.layers.Input(shape=(max_seq_length,), name="input_masks")
        in_segment = keras.layers.Input(shape=(max_seq_length,), name="segment_ids")
        bert_inputs = [in_id, in_mask, in_segment]

        np.random.seed(seed)
        bert_output = BertLayer()(bert_inputs)

        np.random.seed(seed)
        outputs = keras.layers.Dense(n_tags, activation=keras.activations.softmax)(
            bert_output
        )

        np.random.seed(seed)
        model = keras.models.Model(inputs=bert_inputs, outputs=outputs)
        np.random.seed(seed)
        model.compile(
            optimizer=keras.optimizers.Adam(lr=0.000008),
            loss=keras.losses.categorical_crossentropy,
            metrics=["accuracy"],
        )
        model.summary(100)
        return model

    def initialize_vars(sess):
        sess.run(tf.compat.v1.local_variables_initializer())
        sess.run(tf.compat.v1.global_variables_initializer())
        sess.run(tf.compat.v1.tables_initializer())
        K.set_session(sess)

    def split(sentences, max):
        """
        Split the sentences to MAX_SEQUENCE_LENGTH and so the number of samples increases accordingly.
        For example, if MAX_SEQUENCE_LENGTH=70, a sentence with length 150 splits in 3 sentences: 150=70+70+10
        """
        new = []
        for data in sentences:
            new.append(([data[x : x + max] for x in range(0, len(data), max)]))
        new = [val for sublist in new for val in sublist]
        return new

    def tag_pos(sentence_tokenized):
        # import pdb; pdb.set_trace()

        # split into multiple sentences of max length, if words in input exeed max lenght
        if len(sentence_tokenized) > MAX_SEQUENCE_LENGTH:
            sentence_tokenized = split([sentence_tokenized], MAX_SEQUENCE_LENGTH)
        else:
            sentence_tokenized = [sentence_tokenized]

        # where the results will be stored
        pred_tuples = []

        for sentence_ini in sentence_tokenized:

            tokens_a = sentence_ini

            orig_to_tok_map = []
            tokens = []
            segment_ids = []
            tokens.append("[CLS]")
            segment_ids.append(0)
            orig_to_tok_map.append(len(tokens) - 1)
            for token in tokens_a:
                # orig_to_tok_map.append(len(tokens)) # keep first piece of tokenized term
                tokens.extend(tokenizer.tokenize(token))
                orig_to_tok_map.append(
                    len(tokens) - 1
                )  # # keep last piece of tokenized term -->> gives better results!
                segment_ids.append(0)
            tokens.append("[SEP]")
            segment_ids.append(0)
            orig_to_tok_map.append(len(tokens) - 1)
            input_ids = tokenizer.convert_tokens_to_ids(
                [tokens[i] for i in orig_to_tok_map]
            )

            # Convert data to InputExample format
            test_example = convert_text_to_examples(
                [sentence_ini], [["-PAD-"] * len(sentence_ini)]
            )

            # Convert to features
            (input_ids, input_masks, segment_ids, _) = convert_examples_to_features(
                tokenizer, test_example, max_seq_length=MAX_SEQUENCE_LENGTH + 2
            )

            with sess.as_default():
                with sess.graph.as_default():
                    predictions = model.predict(
                        [input_ids, input_masks, segment_ids], batch_size=1
                    ).argmax(-1)[0]

            pred_tuples += [
                (sentence_ini[i - 1], int2tag[pred])
                for i, pred in enumerate(predictions)
                if not i > len(sentence_ini) and pred != 0
            ]

        return pred_tuples

    MAX_SEQUENCE_LENGTH = 70

    # Initialize session
    tf_config = tf.compat.v1.ConfigProto(allow_soft_placement=True)
    tf_config.gpu_options.allow_growth = True
    # tf_config.intra_op_parallelism_threads = 2
    # tf_config.inter_op_parallelism_threads = 2
    sess = tf.compat.v1.Session(config=tf_config)
    initialize_vars(sess)

    tags = {
        "ADJ",
        "ADP",
        "ADV",
        "AUX",
        "CCONJ",
        "DET",
        "INTJ",
        "NOUN",
        "NUM",
        "PART",
        "PRON",
        "PROPN",
        "PUNCT",
        "SCONJ",
        "VERB",
        "X",
    }

    # label mappings
    tag2int = {}
    int2tag = {}

    for i, tag in enumerate(sorted(tags)):
        tag2int[tag] = i + 1
        int2tag[i + 1] = tag

    # Special character for the tags
    tag2int["-PAD-"] = 0
    int2tag[0] = "-PAD-"

    n_tags = len(tag2int)

    # graph = tf.get_default_graph()

    # Params for bert model and tokenization
    bert_path = "https://tfhub.dev/google/bert_multi_cased_L-12_H-768_A-12/1"  # use multi lang version!

    # Instantiate tokenizer
    tokenizer = create_tokenizer_from_hub_module(bert_path)

    model = build_model(MAX_SEQUENCE_LENGTH + 2)  # We sum 2 for [CLS], [SEP] tokens
    model.load_weights(MODEL_WEIGHTS_DIR)
    model._make_predict_function()