# Source code for mindformers.pipeline.fill_mask_pipeline

# Copyright 2022 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================

"""TranslationPipeline"""
import os.path
# from tkinter import _Padding

import numpy as np
import mindspore
from mindspore import ops, Tensor, Model
from ..auto_class import AutoProcessor, AutoModel
from ..mindformer_book import MindFormerBook
from .base_pipeline import BasePipeline
from ..tools.register import MindFormerRegister, MindFormerModuleType
from ..models import BaseModel, Tokenizer

__all__ = ['FillMaskPipeline']

@MindFormerRegister.register(MindFormerModuleType.PIPELINE, alias="fill_mask")
class FillMaskPipeline(BasePipeline):
    """
    Pipeline for mask filling: predicts the tokens hidden behind ``[MASK]``
    placeholders in a piece of text.

    Args:
        model (str or BaseModel or Model): a model name listed in
            ``_support_list``, a local directory, or an already built model.
        tokenizer (None or Tokenizer): the tokenizer used for text processing.
            When ``model`` is a string and no tokenizer is given, the
            tokenizer is loaded from the same name/directory.

    Raises:
        TypeError: if ``model`` is a string and the tokenizer is not a
            ``Tokenizer``, or if the model is neither a ``BaseModel`` nor a
            ``Model``.
        ValueError: if ``model`` is a string that is neither in
            ``_support_list`` nor an existing directory, or if no tokenizer
            is available.
    """
    _support_list = MindFormerBook.get_model_support_list()['bert']
    return_name = 'fillmask'

    def __init__(self, model, tokenizer=None, **kwargs):
        if isinstance(model, str):
            if model not in self._support_list and not os.path.isdir(model):
                raise ValueError(f"{model} is not supported by {self.__class__.__name__},"
                                 f"please selected from {self._support_list}.")

            if tokenizer is None:
                tokenizer = AutoProcessor.from_pretrained(model).tokenizer
            model = AutoModel.from_pretrained(model)
            if not isinstance(tokenizer, Tokenizer):
                raise TypeError(f"tokenizer should be inherited from"
                                f" PretrainedTokenizer, but got {type(tokenizer)}.")

        if not isinstance(model, (BaseModel, Model)):
            raise TypeError(f"model should be inherited from BaseModel or Model, but got type {type(model)}.")

        if tokenizer is None:
            raise ValueError(f"{self.__class__.__name__}"
                             " requires for a tokenizer.")

        # Raw text of the most recent request; postprocess fills its masks.
        self.input_text = ""
        super().__init__(model, tokenizer, **kwargs)

    def _sanitize_parameters(self, **pipeline_parameters):
        """
        Split the user supplied keyword arguments into the parameters for
        preprocess, forward and postprocess.

        Args:
            pipeline_parameters (dict): keyword arguments given to the pipeline.

        Return:
            A tuple ``(preprocess_params, forward_kwargs, postprocess_params)``.

        Raises:
            ValueError: if ``batch_size`` is given, batch inference is not
                supported by this pipeline.
        """
        if 'batch_size' in pipeline_parameters:
            raise ValueError(f"The {self.__class__.__name__} does not support batch inference, please remove the "
                             f"batch_size")

        forward_key_name = ['top_k', 'top_p', 'do_sample', 'eos_token_id', 'repetition_penalty', 'max_length']
        forward_kwargs = {key: pipeline_parameters[key]
                          for key in forward_key_name if key in pipeline_parameters}

        # Only the options that preprocess actually consumes are routed to it.
        # Everything left in preprocess_params is forwarded to the tokenizer
        # call, so sampling options (top_k, top_p, ...) must not end up here.
        preprocess_key_name = ['max_length', 'padding']
        preprocess_params = {key: value for key, value in pipeline_parameters.items()
                             if key in preprocess_key_name}

        postprocess_params = {}
        return preprocess_params, forward_kwargs, postprocess_params

    def preprocess(self, inputs, **preprocess_params):
        """
        Preprocess of mask fill: tokenize the text and build the model inputs.

        Args:
            inputs (str, dict, Tensor): the text containing ``[MASK]``
                placeholders. A dict is read through its ``'text'`` key and a
                Tensor is converted to a Python list before tokenization.
            max_length (int): max length of tokenizer's output. Default: 128.
            padding (False / "max_length"): padding for max_length.
                Default: "max_length".
            preprocess_params: any other keyword argument is passed to the
                tokenizer unchanged. ``return_tensors`` is always "ms" and
                must not be passed.

        Return:
            A dict with ``input_ids``, ``input_mask``, ``token_type_id`` and
            ``masked_lm_positions``, each expanded with a leading axis.
        """
        if isinstance(inputs, dict):
            inputs = inputs['text']
        if isinstance(inputs, mindspore.Tensor):
            inputs = inputs.asnumpy().tolist()

        # The raw text is only kept for string inputs; postprocess substitutes
        # the predicted tokens into it.
        self.input_text = inputs if isinstance(inputs, str) else ""

        max_length = preprocess_params.pop("max_length", 128)
        padding = preprocess_params.pop("padding", "max_length")
        encoded = self.tokenizer(inputs, max_length=max_length, padding=padding,
                                 return_tensors="ms", **preprocess_params)

        expand_dims = ops.ExpandDims()
        # NOTE(review): mask_index is presumably populated on the tokenizer by
        # the call above - confirm against the tokenizer implementation.
        mask_positions = Tensor(self.tokenizer.mask_index)
        return {"input_ids": expand_dims(encoded["input_ids"], 0),
                "input_mask": expand_dims(encoded["attention_mask"], 0),
                "token_type_id": expand_dims(encoded["token_type_ids"], 0),
                "masked_lm_positions": expand_dims(mask_positions, 0)}

    def forward(self, model_inputs, **forward_params):
        """
        Forward process

        Args:
            model_inputs (dict): outputs of preprocess.
            forward_params: accepted for interface compatibility, not used.

        Return:
            A dict holding the raw network outputs under ``output_ids``.
        """
        output_ids = self.network(**model_inputs)
        return {"output_ids": output_ids}

    def postprocess(self, model_outputs, **postprocess_params):
        """
        Postprocess: pick the best token for every mask and fill the text.

        Args:
            model_outputs (dict): outputs of forward process.
            postprocess_params: accepted for interface compatibility, not used.

        Return:
            A list with a single element: a list holding one dict per
            prediction (``score``, ``token``, ``token_str``) followed by one
            dict with the filled ``sequence``.
        """
        # NOTE(review): element [-2] of the network outputs is treated as a
        # 2-D score array (one row per mask) - confirm against the model.
        scores = model_outputs["output_ids"][-2].asnumpy()
        best_ids = np.argmax(scores, axis=1)

        results = []
        for row, token_id in enumerate(best_ids):
            token_str = self.tokenizer.decode([int(token_id)], skip_special_tokens=True)
            token_str = token_str.replace(' ', '')
            results.append({'score': scores[row, token_id],
                            'token': token_id,
                            'token_str': token_str})
            # Masks are filled left to right, one per prediction.
            self.input_text = self.input_text.replace('[MASK]', token_str, 1)

        results.append({'sequence': self.input_text})
        return [results]