# function_builder.py

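# coding=utf-8
"""Graph builders for the XLNet pretraining loss and downstream task heads.

Covers classification, regression, span-extraction QA, sequence labelling (NER)
and multi-choice QA (RACE), plus a helper that builds a TPU host call for
scalar summaries.
"""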
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import functools
import sys
from tensorflow import print as tf_print
import os
import tensorflow as tf
import modeling
import xlnet
from tensorflow.contrib.layers.python.layers import initializers


def construct_scalar_host_call(
        monitor_dict,
        model_dir,
        prefix="",
        reduce_fn=None):
    """Build a host call that writes scalar summaries for monitored tensors.

    Args:
        monitor_dict: mapping from metric name to the tensor to record.
        model_dir: directory handed to the summary file writer as `logdir`.
        prefix: string prepended to every metric name to form the summary tag.
        reduce_fn: optional callable applied to each received tensor to obtain
            the scalar to log; when None, element 0 of the tensor is logged.

    Returns:
        A pair `(host_call_fn, tensors)`. `tensors` is the global step followed
        by the monitored tensors (in `monitor_dict` key order), each reshaped
        to `[1]`; `host_call_fn` expects them as positional arguments.
    """
    names = list(monitor_dict)

    def host_call_fn(global_step, *args):
        """Record one scalar summary per monitored metric."""
        step = global_step[0]
        writer = tf.contrib.summary.create_file_writer(
            logdir=model_dir, filename_suffix=".host_call")
        with writer.as_default(), tf.contrib.summary.always_record_summaries():
            for i, name in enumerate(names):
                value = args[i][0] if reduce_fn is None else reduce_fn(args[i])
                # Summaries are only recorded every 100 global steps.
                every_100 = tf.contrib.summary.record_summaries_every_n_global_steps(
                    100, global_step=step)
                with every_100:
                    tf.contrib.summary.scalar(prefix + name, value, step=step)

            return tf.contrib.summary.all_summary_ops()

    # The global step always comes first, matching host_call_fn's signature.
    tensors = [tf.reshape(tf.train.get_or_create_global_step(), [1])]
    tensors += [tf.reshape(monitor_dict[name], [1]) for name in names]

    return host_call_fn, tensors


def two_stream_loss(FLAGS, features, labels, mems, is_training):
    """Build the masked LM pretraining loss of the two-stream XLNet model.

    Args:
        FLAGS: flag object. `num_predict`, `model_dir` and `use_bfloat16` are
            read here; the object is also forwarded to `xlnet.XLNetConfig` and
            `xlnet.create_run_config`.
        features: dict with keys "input_k", "input_q", "seg_id", "perm_mask",
            "target", "target_mask", and "target_mapping" (read only when
            `FLAGS.num_predict` is not None).
        labels: not used by this function.
        mems: dict whose "mems" entry (if present) is fed to the model.
        is_training: forwarded to `xlnet.create_run_config`.

    Returns:
        `(total_loss, new_mems, monitor_dict)`: the mask-weighted mean LM loss,
        a dict holding the model's new memory under "mems", and a dict of
        tensors to monitor (currently only "total_loss").

    Side effects:
        Writes the model config to `<FLAGS.model_dir>/config.json`.
    """

    def _swap_axes(key):
        # Exchange the two axes of a rank-2 feature.
        return tf.transpose(features[key], [1, 0])

    #### Unpack input
    memory_key = "mems"
    prev_mems = mems.get(memory_key, None)

    content_ids = _swap_axes("input_k")
    query_ids = _swap_axes("input_q")
    segment_ids = _swap_axes("seg_id")

    permutation_mask = tf.transpose(features["perm_mask"], [1, 2, 0])

    if FLAGS.num_predict is None:
        mapping = None
    else:
        # [num_predict x tgt_len x bsz]
        mapping = tf.transpose(features["target_mapping"], [1, 2, 0])

    # targets and their mask for the LM loss
    targets = _swap_axes("target")
    target_weights = _swap_axes("target_mask")

    # Build the model config from FLAGS and persist it next to the checkpoints.
    config = xlnet.XLNetConfig(FLAGS=FLAGS)
    config.to_json(os.path.join(FLAGS.model_dir, "config.json"))

    run_cfg = xlnet.create_run_config(is_training, False, FLAGS)

    model = xlnet.XLNetModel(
        xlnet_config=config,
        run_config=run_cfg,
        input_ids=content_ids,
        seg_ids=segment_ids,
        input_mask=None,
        mems=prev_mems,
        perm_mask=permutation_mask,
        target_mapping=mapping,
        inp_q=query_ids)

    hidden_states = model.get_sequence_output()
    new_mems = {memory_key: model.get_new_memory()}
    embedding_table = model.get_embedding_table()
    init = model.get_initializer()

    with tf.variable_scope("model", reuse=tf.AUTO_REUSE):
        lm_loss_values = modeling.lm_loss(
            hidden=hidden_states,
            target=targets,
            n_token=config.n_token,
            d_model=config.d_model,
            initializer=init,
            lookup_table=embedding_table,
            tie_weight=True,
            bi_data=run_cfg.bi_data,
            use_tpu=run_cfg.use_tpu)

    if FLAGS.use_bfloat16:
        # Do the final reduction in float32.
        target_weights = tf.cast(target_weights, tf.float32)
        lm_loss_values = tf.cast(lm_loss_values, tf.float32)

    weighted_sum = tf.reduce_sum(lm_loss_values * target_weights)
    total_loss = weighted_sum / tf.reduce_sum(target_weights)

    #### Quantity to monitor
    return total_loss, new_mems, {"total_loss": total_loss}


def get_loss(FLAGS, features, labels, mems, is_training):
    """Return `two_stream_loss(...)`, built under a bfloat16 scope if requested.

    When `FLAGS.use_bfloat16` is set, the loss graph is constructed inside
    `tf.tpu.bfloat16_scope()`; otherwise it is constructed directly. All
    arguments are forwarded unchanged to `two_stream_loss`.
    """
    if not FLAGS.use_bfloat16:
        return two_stream_loss(FLAGS, features, labels, mems, is_training)

    with tf.tpu.bfloat16_scope():
        return two_stream_loss(FLAGS, features, labels, mems, is_training)


def get_classification_loss(
        FLAGS, features, n_class, is_training):
    """Build the classification loss on top of the pooled XLNet output.

    Args:
        FLAGS: flag object; reads `model_config_path`, `summary_type`,
            `use_summ_proj`, `cls_scope` and `task_name`, and is forwarded to
            `xlnet.create_run_config`.
        features: dict with keys "input_ids", "segment_ids", "input_mask" and
            "label_ids".
        n_class: number of classes, forwarded to `modeling.classification_loss`.
        is_training: forwarded to `xlnet.create_run_config`.

    Returns:
        `(total_loss, per_example_loss, logits)` where `total_loss` is the mean
        of `per_example_loss`.
    """
    batch_size = tf.shape(features["input_ids"])[0]

    def _swap_axes(key):
        # Exchange the two axes of a rank-2 feature.
        return tf.transpose(features[key], [1, 0])

    token_ids = _swap_axes("input_ids")
    segment_ids = _swap_axes("segment_ids")
    mask = _swap_axes("input_mask")
    targets = tf.reshape(features["label_ids"], [batch_size])

    config = xlnet.XLNetConfig(json_path=FLAGS.model_config_path)
    run_cfg = xlnet.create_run_config(is_training, True, FLAGS)

    model = xlnet.XLNetModel(
        xlnet_config=config,
        run_config=run_cfg,
        input_ids=token_ids,
        seg_ids=segment_ids,
        input_mask=mask)

    pooled = model.get_pooled_out(FLAGS.summary_type, FLAGS.use_summ_proj)

    with tf.variable_scope("model", reuse=tf.AUTO_REUSE):
        # An explicit, non-empty cls_scope wins; otherwise fall back to the
        # lower-cased task name.
        scope_suffix = FLAGS.cls_scope or FLAGS.task_name.lower()
        head_scope = "classification_{}".format(scope_suffix)

        per_example_loss, logits = modeling.classification_loss(
            hidden=pooled,
            labels=targets,
            n_class=n_class,
            initializer=model.get_initializer(),
            scope=head_scope,
            return_logits=True)

        total_loss = tf.reduce_mean(per_example_loss)

    return total_loss, per_example_loss, logits


def get_regression_loss(
        FLAGS, features, is_training):
    """Build the regression loss on top of the pooled XLNet output.

    Args:
        FLAGS: flag object; reads `model_config_path`, `summary_type`,
            `use_summ_proj` and `task_name`, and is forwarded to
            `xlnet.create_run_config`.
        features: dict with keys "input_ids", "segment_ids", "input_mask" and
            "label_ids".
        is_training: forwarded to `xlnet.create_run_config`.

    Returns:
        `(total_loss, per_example_loss, logits)` where `total_loss` is the mean
        of `per_example_loss`.
    """
    batch_size = tf.shape(features["input_ids"])[0]

    def _swap_axes(key):
        # Exchange the two axes of a rank-2 feature.
        return tf.transpose(features[key], [1, 0])

    token_ids = _swap_axes("input_ids")
    segment_ids = _swap_axes("segment_ids")
    mask = _swap_axes("input_mask")
    targets = tf.reshape(features["label_ids"], [batch_size])

    config = xlnet.XLNetConfig(json_path=FLAGS.model_config_path)
    run_cfg = xlnet.create_run_config(is_training, True, FLAGS)

    model = xlnet.XLNetModel(
        xlnet_config=config,
        run_config=run_cfg,
        input_ids=token_ids,
        seg_ids=segment_ids,
        input_mask=mask)

    pooled = model.get_pooled_out(FLAGS.summary_type, FLAGS.use_summ_proj)

    with tf.variable_scope("model", reuse=tf.AUTO_REUSE):
        head_scope = "regression_{}".format(FLAGS.task_name.lower())
        per_example_loss, logits = modeling.regression_loss(
            hidden=pooled,
            labels=targets,
            initializer=model.get_initializer(),
            scope=head_scope,
            return_logits=True)

        total_loss = tf.reduce_mean(per_example_loss)

    return total_loss, per_example_loss, logits


def get_qa_outputs(FLAGS, features, is_training):
    """Build the span-extraction QA head (e.g. SQuAD) on top of XLNet.

    Despite the historical "loss" wording, this function returns the head's
    outputs, not a loss.

    Args:
        FLAGS: flag object; reads `model_config_path`, `start_n_top`,
            `end_n_top` and `dropout`, and is forwarded to
            `xlnet.create_run_config`.
        features: dict with keys "input_ids", "segment_ids", "input_mask",
            "cls_index", "p_mask", and (training only) "start_positions".
        is_training: Python bool selecting the training branch (end logits
            conditioned on the ground-truth start) or the inference branch
            (end logits conditioned on the top-k predicted starts).

    Returns:
        A dict. It always contains "cls_logits". In training it also contains
        "start_log_probs" and "end_log_probs"; otherwise it contains
        "start_top_log_probs", "start_top_index", "end_top_log_probs" and
        "end_top_index".
    """

    # Swap the two axes of each input before feeding the model.
    inp = tf.transpose(features["input_ids"], [1, 0])
    seg_id = tf.transpose(features["segment_ids"], [1, 0])
    inp_mask = tf.transpose(features["input_mask"], [1, 0])
    cls_index = tf.reshape(features["cls_index"], [-1])

    # Size of the leading axis of the transposed input.
    seq_len = tf.shape(inp)[0]

    xlnet_config = xlnet.XLNetConfig(json_path=FLAGS.model_config_path)
    run_config = xlnet.create_run_config(is_training, True, FLAGS)

    xlnet_model = xlnet.XLNetModel(
        xlnet_config=xlnet_config,
        run_config=run_config,
        input_ids=inp,
        seg_ids=seg_id,
        input_mask=inp_mask)
    # The einsum subscripts below ("lbh") treat `output` as (len, batch, hidden).
    output = xlnet_model.get_sequence_output()
    initializer = xlnet_model.get_initializer()

    return_dict = {}

    # invalid position mask such as query and special symbols (PAD, SEP, CLS)
    # Positions where p_mask is 1 get their logit replaced by -1e30 below.
    p_mask = features["p_mask"]

    # logit of the start position
    with tf.variable_scope("start_logits"):
        start_logits = tf.layers.dense(
            output,
            1,
            kernel_initializer=initializer)
        # Drop the unit last axis and swap axes so the softmax runs over the
        # same axis that p_mask masks.
        start_logits = tf.transpose(tf.squeeze(start_logits, -1), [1, 0])
        start_logits_masked = start_logits * (1 - p_mask) - 1e30 * p_mask
        start_log_probs = tf.nn.log_softmax(start_logits_masked, -1)

    # logit of the end position
    with tf.variable_scope("end_logits"):
        if is_training:
            # during training, compute the end logits based on the
            # ground truth of the start position

            start_positions = tf.reshape(features["start_positions"], [-1])
            start_index = tf.one_hot(start_positions, depth=seq_len, axis=-1,
                                     dtype=tf.float32)
            # The one-hot contraction picks the hidden state at the start
            # position of each example.
            start_features = tf.einsum("lbh,bl->bh", output, start_index)
            # Repeat the start feature along the sequence axis so it can be
            # concatenated with every position of `output`.
            start_features = tf.tile(start_features[None], [seq_len, 1, 1])
            end_logits = tf.layers.dense(
                tf.concat([output, start_features], axis=-1), xlnet_config.d_model,
                kernel_initializer=initializer, activation=tf.tanh, name="dense_0")
            end_logits = tf.contrib.layers.layer_norm(
                end_logits, begin_norm_axis=-1)

            end_logits = tf.layers.dense(
                end_logits, 1,
                kernel_initializer=initializer,
                name="dense_1")
            end_logits = tf.transpose(tf.squeeze(end_logits, -1), [1, 0])
            end_logits_masked = end_logits * (1 - p_mask) - 1e30 * p_mask
            end_log_probs = tf.nn.log_softmax(end_logits_masked, -1)
        else:
            # during inference, compute the end logits based on beam search

            # Keep the FLAGS.start_n_top most likely start positions.
            start_top_log_probs, start_top_index = tf.nn.top_k(
                start_log_probs, k=FLAGS.start_n_top)
            start_index = tf.one_hot(start_top_index,
                                     depth=seq_len, axis=-1, dtype=tf.float32)
            # One start feature per (example, candidate start).
            start_features = tf.einsum("lbh,bkl->bkh", output, start_index)
            # Add a candidate axis to `output` and a sequence axis to the start
            # features so both can be concatenated on the last axis.
            end_input = tf.tile(output[:, :, None],
                                [1, 1, FLAGS.start_n_top, 1])
            start_features = tf.tile(start_features[None],
                                     [seq_len, 1, 1, 1])
            end_input = tf.concat([end_input, start_features], axis=-1)
            # Same layer names ("dense_0", "dense_1") as the training branch.
            end_logits = tf.layers.dense(
                end_input,
                xlnet_config.d_model,
                kernel_initializer=initializer,
                activation=tf.tanh,
                name="dense_0")
            end_logits = tf.contrib.layers.layer_norm(end_logits,
                                                      begin_norm_axis=-1)
            end_logits = tf.layers.dense(
                end_logits,
                1,
                kernel_initializer=initializer,
                name="dense_1")
            end_logits = tf.reshape(end_logits, [seq_len, -1, FLAGS.start_n_top])
            # Move the sequence axis last so masking and softmax act on it.
            end_logits = tf.transpose(end_logits, [1, 2, 0])
            end_logits_masked = end_logits * (
                    1 - p_mask[:, None]) - 1e30 * p_mask[:, None]
            end_log_probs = tf.nn.log_softmax(end_logits_masked, -1)
            end_top_log_probs, end_top_index = tf.nn.top_k(
                end_log_probs, k=FLAGS.end_n_top)
            # Flatten the (start candidate, end candidate) axes into one.
            end_top_log_probs = tf.reshape(
                end_top_log_probs,
                [-1, FLAGS.start_n_top * FLAGS.end_n_top])
            end_top_index = tf.reshape(
                end_top_index,
                [-1, FLAGS.start_n_top * FLAGS.end_n_top])

    if is_training:
        return_dict["start_log_probs"] = start_log_probs
        return_dict["end_log_probs"] = end_log_probs
    else:
        return_dict["start_top_log_probs"] = start_top_log_probs
        return_dict["start_top_index"] = start_top_index
        return_dict["end_top_log_probs"] = end_top_log_probs
        return_dict["end_top_index"] = end_top_index

    # an additional layer to predict answerability
    with tf.variable_scope("answer_class"):
        # get the representation of CLS
        cls_index = tf.one_hot(cls_index, seq_len, axis=-1, dtype=tf.float32)
        cls_feature = tf.einsum("lbh,bl->bh", output, cls_index)

        # get the representation of START
        # Here the start feature is the softmax-weighted average of `output`
        # rather than a single selected position.
        start_p = tf.nn.softmax(start_logits_masked, axis=-1,
                                name="softmax_start")
        start_feature = tf.einsum("lbh,bl->bh", output, start_p)

        # note(zhiliny): no dependency on end_feature so that we can obtain
        # one single `cls_logits` for each sample
        ans_feature = tf.concat([start_feature, cls_feature], -1)
        ans_feature = tf.layers.dense(
            ans_feature,
            xlnet_config.d_model,
            activation=tf.tanh,
            kernel_initializer=initializer, name="dense_0")
        ans_feature = tf.layers.dropout(ans_feature, FLAGS.dropout,
                                        training=is_training)
        cls_logits = tf.layers.dense(
            ans_feature,
            1,
            kernel_initializer=initializer,
            name="dense_1",
            use_bias=False)
        cls_logits = tf.squeeze(cls_logits, -1)

        return_dict["cls_logits"] = cls_logits

    return return_dict


def get_ner_loss(FLAGS, features, is_training):  # , lengths):
    """Loss for downstream sequence labelling such as NER.

    A linear layer maps every token representation to `num_labels` logits and
    the returned loss is the token-level softmax cross-entropy against
    "label_ids".

    Args:
        FLAGS: flag object; reads `max_seq_length` and `model_config_path`, and
            is forwarded to `xlnet.create_run_config`.
        features: dict with keys "input_ids", "segment_ids", "input_mask" and
            "label_ids".
        is_training: forwarded to `xlnet.create_run_config`.

    Returns:
        `(total_loss, per_example_loss, logits)`:
        `logits` is reshaped to `[-1, FLAGS.max_seq_length, num_labels]`,
        `per_example_loss` is the cross-entropy per token (the label axis is
        summed out) and `total_loss` is its mean over all tokens.
    """

    bsz_per_core = tf.shape(features["input_ids"])[0]

    input_ids = features["input_ids"]
    # Every non-zero id counts as a real token, so summing the 0/1 indicator
    # over axis 1 gives one sequence length per example in the batch.
    # assumes id 0 is used for padding only — TODO confirm against the
    # input pipeline.
    used = tf.sign(tf.abs(input_ids))
    lengths = tf.reduce_sum(used, axis=1)

    def _transform_features(feature):
        # [bsz, seq_len] -> [seq_len, bsz], the layout fed to XLNetModel.
        out = tf.reshape(feature, [bsz_per_core, -1])
        return tf.transpose(out, [1, 0])

    inp = _transform_features(features["input_ids"])
    seg_id = _transform_features(features["segment_ids"])
    inp_mask = _transform_features(features["input_mask"])
    labels = tf.reshape(features["label_ids"], [bsz_per_core, FLAGS.max_seq_length])

    xlnet_config = xlnet.XLNetConfig(json_path=FLAGS.model_config_path)
    run_config = xlnet.create_run_config(is_training, True, FLAGS)

    xlnet_model = xlnet.XLNetModel(
        xlnet_config=xlnet_config,
        run_config=run_config,
        input_ids=inp,
        seg_ids=seg_id,
        input_mask=inp_mask)
    embedded_chars = xlnet_model.get_sequence_output()
    embedding_dims = embedded_chars.shape[-1]
    # NOTE(review): the tag-set size is hard-coded; it must match the label
    # vocabulary used to produce "label_ids".
    num_labels = 17
    with tf.variable_scope("logits", reuse=tf.AUTO_REUSE):
        W = tf.get_variable("W", shape=[embedding_dims, num_labels],
                            dtype=tf.float32, initializer=initializers.xavier_initializer())

        b = tf.get_variable("b", shape=[num_labels], dtype=tf.float32,
                            initializer=tf.zeros_initializer())

        # The model was fed [seq_len, bsz] inputs and its sequence output is
        # time-major. Bring it to batch-major before flattening; otherwise the
        # reshape to [bsz, max_seq_length, num_labels] below would interleave
        # tokens of different examples whenever bsz > 1 and the logits would
        # not line up with `labels`.
        batch_major = tf.transpose(embedded_chars, [1, 0, 2])
        x = tf.reshape(batch_major, shape=[-1, embedding_dims])  # [bsz * seq_len, embedding_dims]
        pred = tf.nn.xw_plus_b(x, W, b)
        logits = tf.reshape(pred, [-1, FLAGS.max_seq_length, num_labels])

        # NOTE(review): the CRF ops are kept so the graph (including the
        # transition variable created by crf_log_likelihood) is unchanged, but
        # neither the log-likelihood nor the decoded path feeds the returned
        # loss — confirm whether a CRF loss was intended.
        _log_likelihood, trans = tf.contrib.crf.crf_log_likelihood(
            inputs=logits,
            tag_indices=labels,
            sequence_lengths=lengths)
        # CRF decode: the highest-scoring tag path for each sequence.
        _pred_ids, _ = tf.contrib.crf.crf_decode(potentials=logits,
                                                 transition_params=trans,
                                                 sequence_length=lengths)

        # Token-level softmax cross-entropy.
        one_hot_target = tf.one_hot(labels, num_labels)
        per_example_loss = -tf.reduce_sum(
            tf.nn.log_softmax(logits) * one_hot_target, -1)
        total_loss = tf.reduce_mean(per_example_loss)
    return total_loss, per_example_loss, logits


def get_race_loss(FLAGS, features, is_training):
    """Build the multi-choice QA loss (e.g. RACE) with four options per example.

    Args:
        FLAGS: flag object; reads `model_config_path`, `summary_type` and
            `use_summ_proj`, and is forwarded to `xlnet.create_run_config`.
        features: dict with keys "input_ids", "segment_ids", "input_mask" and
            "label_ids".
        is_training: forwarded to `xlnet.create_run_config`.

    Returns:
        `(total_loss, per_example_loss, logits)`: `logits` has shape
        `[bsz_per_core, 4]`, `per_example_loss` is the softmax cross-entropy of
        each example and `total_loss` is its mean.
    """
    n_choice = 4
    batch_size = tf.shape(features["input_ids"])[0]

    def _to_model_layout(key):
        # Split out the choice axis, move the remaining axis to the front and
        # fold (batch, choice) into a single trailing axis.
        grouped = tf.reshape(features[key], [batch_size, n_choice, -1])
        leading = tf.transpose(grouped, [2, 0, 1])
        return tf.reshape(leading, [-1, batch_size * n_choice])

    token_ids = _to_model_layout("input_ids")
    segment_ids = _to_model_layout("segment_ids")
    mask = _to_model_layout("input_mask")
    targets = tf.reshape(features["label_ids"], [batch_size])

    config = xlnet.XLNetConfig(json_path=FLAGS.model_config_path)
    run_cfg = xlnet.create_run_config(is_training, True, FLAGS)

    model = xlnet.XLNetModel(
        xlnet_config=config,
        run_config=run_cfg,
        input_ids=token_ids,
        seg_ids=segment_ids,
        input_mask=mask)
    pooled = model.get_pooled_out(FLAGS.summary_type, FLAGS.use_summ_proj)

    with tf.variable_scope("logits"):
        # One score per (example, choice), regrouped so each row holds the
        # scores of one example's choices.
        scores = tf.layers.dense(pooled, 1,
                                 kernel_initializer=model.get_initializer())
        logits = tf.reshape(scores, [batch_size, n_choice])

        target_one_hot = tf.one_hot(targets, n_choice)
        log_probs = tf.nn.log_softmax(logits)
        per_example_loss = -tf.reduce_sum(log_probs * target_one_hot, -1)
        total_loss = tf.reduce_mean(per_example_loss)

    return total_loss, per_example_loss, logits