mindformers.pipeline.text_classification_pipeline 源代码

# Copyright 2023 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================

"""TextClassificationPipeline"""
import os.path

import numpy as np
from mindspore import ops, Tensor, Model
from ..auto_class import AutoProcessor, AutoModel
from ..mindformer_book import MindFormerBook
from .base_pipeline import BasePipeline
from ..tools.register import MindFormerRegister, MindFormerModuleType
from ..models import BaseModel, Tokenizer
from ..dataset.labels import mnli_labels

__all__ = ['TextClassificationPipeline']

[文档]@MindFormerRegister.register(MindFormerModuleType.PIPELINE, alias="text_classification")
class TextClassificationPipeline(BasePipeline):
    """Pipeline for text classification

    Args:
        model (Union[str, BaseModel]): The model used to perform task,
            the input could be a supported model name, or a model instance
            inherited from BaseModel.
        tokenizer : a tokenizer (None or Tokenizer) for text processing.

    Raises:
        TypeError: If input model and image_processor's types are not corrected.
        ValueError: If the input model is not in support list.

    Examples:
        >>> from mindformers.pipeline import TextClassificationPipeline
        >>> from mindformers import AutoTokenizer, BertForMultipleChoice, AutoConfig
        >>> input_data = ["The new rights are nice enough-Everyone really likes the newest benefits ",
        ...               "i don't know um do you do a lot of camping-I know exactly."]
        >>> tokenizer = AutoTokenizer.from_pretrained('txtcls_bert_base_uncased_mnli')
        >>> txtcls_mnli_config = AutoConfig.from_pretrained('txtcls_bert_base_uncased_mnli')
        >>> model = BertForMultipleChoice(txtcls_mnli_config)
        >>> txtcls_pipeline = TextClassificationPipeline(task='text_classification',
        ...                                              model=model,
        ...                                              tokenizer=tokenizer,
        ...                                              max_length=model.config.seq_length,
        ...                                              padding="max_length")
        >>> results = txtcls_pipeline(input_data, top_k=1)
        >>> print(results)
            [[{'label': 'neutral', 'score': 0.9714198708534241}],
            [{'label': 'contradiction', 'score': 0.9967639446258545}]]
    """
    _support_list = MindFormerBook.get_pipeline_support_task_list()['text_classification'].keys()

    def __init__(self, model, tokenizer=None, **kwargs):
        if isinstance(model, str):
            if model in self._support_list or os.path.isdir(model):
                if tokenizer is None:
                    tokenizer = AutoProcessor.from_pretrained(model).tokenizer
                model = AutoModel.from_pretrained(model)
                if not isinstance(tokenizer, Tokenizer):
                    raise TypeError(f"tokenizer should be inherited from"
                                    f" BaseTokenizer, but got {type(tokenizer)}.")
            else:
                raise ValueError(f"{model} is not supported by {self.__class__.__name__},"
                                 f"please selected from {self._support_list}.")

        if not isinstance(model, (BaseModel, Model)):
            raise TypeError(f"model should be inherited from BaseModel or Model, but got type {type(model)}.")

        if tokenizer is None:
            raise ValueError(f"{self.__class__.__name__}"
                             " requires for a tokenizer.")

        super().__init__(model, tokenizer, **kwargs)

    def _sanitize_parameters(self, **pipeline_parameters):
        """sanitize parameters for preprocess, forward, and postprocess."""
        if 'batch_size' in pipeline_parameters:
            raise ValueError(f"The {self.__class__.__name__} does not support batch inference, please remove the "
                             f"batch_size")

        postprocess_params = {}

        forward_key_name = ['top_k', 'top_p', 'do_sample', 'eos_token_id', 'repetition_penalty', 'max_length']
        forward_kwargs = {}
        for item in forward_key_name:
            if item in pipeline_parameters:
                forward_kwargs[item] = pipeline_parameters.get(item)

        preprocess_key_name = ['top_k', 'top_p', 'do_sample', 'eos_token_id', 'repetition_penalty', 'max_length',
                               'padding']
        preprocess_params = {k: v for k, v in pipeline_parameters.items() if k in preprocess_key_name}

        if "top_k" in pipeline_parameters:
            postprocess_params["top_k"] = pipeline_parameters.get("top_k")
        return preprocess_params, forward_kwargs, postprocess_params

[文档]    def inputs_process(self, inputs_zero, inputs_one):
        """
        process of two sentences relationship classification

        Args:
            inputs_zero (str): the first sentence
            inputs_one (str): the second sentence

        Return:
            processed inputs, mask, token_type about two sentences
        """
        len_inputs = len(inputs_zero["input_ids"])
        inputs_zero_input = list(inputs_zero["input_ids"].asnumpy())
        inputs_one_input = list(inputs_one["input_ids"].asnumpy())
        inputs_zero_input = [x for x in inputs_zero_input if x != 0]
        inputs_one_input = [x for x in inputs_one_input if x != 0]
        token_type = [0] * len(inputs_zero_input) + [1] * (len(inputs_one_input) - 1)
        token_type = token_type + [0] * (len_inputs - len(token_type))
        inputs = inputs_zero_input + inputs_one_input[1:]
        len_inputs_mask = len(inputs)
        inputs = inputs + [0] * (len_inputs - len(inputs))
        mask = [1] * len_inputs_mask + [0] * (len_inputs - len_inputs_mask)
        return inputs, mask, token_type

[文档]    def preprocess(self, inputs, **preprocess_params):
        """
        Preprocess of text classification

        Args:
            inputs (str): the str to be classified.
            max_length (int): max length of tokenizer's output
            padding (False / "max_length"): padding for max_length
            return_tensors ("ms"): the type of returned tensors

        Return:
            processed text.
        """
        if not isinstance(inputs, str):
            raise ValueError("Inputs type must be str")
        if '-' not in inputs:
            raise ValueError("two texts of text pair should be split by -")
        inputs = inputs.split('-')
        max_length = preprocess_params.pop("max_length", 128)
        padding = preprocess_params.pop("padding", "max_length")
        inputs_zero = self.tokenizer(inputs[0], max_length=max_length, padding=padding,
                                     return_tensors="ms", **preprocess_params)
        inputs_one = self.tokenizer(inputs[1], max_length=max_length, padding=padding,
                                    return_tensors="ms", **preprocess_params)
        inputs, mask, token_type = self.inputs_process(inputs_zero, inputs_one)
        inputs_final = {}
        inputs_final["input_ids"] = Tensor.from_numpy(np.array(inputs, dtype=np.int32))
        inputs_final["attention_mask"] = Tensor.from_numpy(np.array(mask, dtype=np.int32))
        inputs_final["token_type_ids"] = Tensor.from_numpy(np.array(token_type, dtype=np.int32))
        expand_dims = ops.ExpandDims()
        return {"input_ids": expand_dims(inputs_final["input_ids"], 0),
                "input_mask": expand_dims(inputs_final["attention_mask"], 0),
                "token_type_id": expand_dims(inputs_final["token_type_ids"], 0),
                "label_ids": None}

[文档]    def forward(self, model_inputs, **forward_params):
        """
        Forward process

        Args:
            model_inputs (dict): outputs of preprocess.

        Return:
            probs dict.
        """
        forward_params.pop("None", None)
        output_ids = self.network(**model_inputs)
        return output_ids

    def softmax(self, outputs):
        maxes = np.max(outputs, axis=-1, keepdims=True)
        shifted_exp = np.exp(outputs - maxes)
        return shifted_exp / shifted_exp.sum(axis=-1, keepdims=True)

[文档]    def postprocess(self, model_outputs, **postprocess_params):
        """
        Postprocess

        Args:
            model_outputs (dict): outputs of forward process.
            top_k (int): Return top_k probs of result

        Return:
            Classification results
        """
        top_k = postprocess_params.pop("top_k", None)
        id2label = {id: label for id, label in enumerate(mnli_labels)}
        outputs = model_outputs[0]
        outputs = outputs.asnumpy()
        scores = self.softmax(outputs)

        dict_scores = [
            {"label": id2label[i], "score": score.item()} for i, score in enumerate(scores)
        ]

        dict_scores.sort(key=lambda x: x["score"], reverse=True)
        if top_k is not None:
            dict_scores = dict_scores[:top_k]
        return [dict_scores]