# Source code for mindformers.pipeline.token_classification_pipeline

# Copyright 2023 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# This file refers to the following project:
# https://github.com/lonePatient/daguan_2019_rank9/blob/master/pydatagrand/train/ner_utils.py
# ============================================================================

"""TokenClassificationPipeline"""
import os.path

import numpy as np
from mindspore import ops, Model
from ..auto_class import AutoProcessor, AutoModel
from ..mindformer_book import MindFormerBook
from .base_pipeline import BasePipeline
from ..tools.register import MindFormerRegister, MindFormerModuleType
from ..models import BaseModel, Tokenizer

__all__ = ['TokenClassificationPipeline']

@MindFormerRegister.register(MindFormerModuleType.PIPELINE, alias="token_classification")
class TokenClassificationPipeline(BasePipeline):
    r"""Pipeline for token classification

    Args:
        model (Union[str, BaseModel]): The model used to perform task,
            the input could be a supported model name, or a model instance
            inherited from BaseModel.
        tokenizer : a tokenizer (None or Tokenizer) for text processing.
        id2label : a dict which maps label id to label str.

    Raises:
        TypeError: If the types of the input model or tokenizer are not correct.
        ValueError: If the input model is not in support list, or if the
            tokenizer / id2label is missing.

    Examples:
        >>> from mindformers.pipeline import TokenClassificationPipeline
        >>> from mindformers import AutoTokenizer, BertForTokenClassification, AutoConfig
        >>> from mindformers.dataset.labels import cluener_labels
        >>> id2label = {label_id: label for label_id, label in enumerate(cluener_labels)}
        >>> input_data = ["表身刻有代表日内瓦钟表匠freresoltramare的“fo”字样。"]
        >>> tokenizer = AutoTokenizer.from_pretrained('tokcls_bert_base_chinese_cluener')
        >>> ner_dense_cluener_config = AutoConfig.from_pretrained('tokcls_bert_base_chinese_cluener')
        >>> model = BertForTokenClassification(ner_dense_cluener_config)
        >>> tokcls_pipeline = TokenClassificationPipeline(task='token_classification',
        ...                                               model=model,
        ...                                               id2label=id2label,
        ...                                               tokenizer=tokenizer,
        ...                                               max_length=model.config.seq_length,
        ...                                               padding="max_length")
        >>> results = tokcls_pipeline(input_data)
        >>> print(results)
        [[{'entity_group': 'address', 'start': 6, 'end': 8, 'score': 0.52329, 'word': '日内瓦'},
          {'entity_group': 'name', 'start': 12, 'end': 25, 'score': 0.83922, 'word': 'freresoltramar'}]]
    """
    _support_list = MindFormerBook.get_pipeline_support_task_list()['token_classification'].keys()

    def __init__(self, model, id2label, tokenizer=None, **kwargs):
        # A string model is either a supported model name or a local directory;
        # it is resolved to a model instance (and a tokenizer, when none is given).
        if isinstance(model, str):
            if model not in self._support_list and not os.path.isdir(model):
                raise ValueError(f"{model} is not supported by {self.__class__.__name__}, "
                                 f"please selected from {self._support_list}.")
            if tokenizer is None:
                tokenizer = AutoProcessor.from_pretrained(model).tokenizer
            model = AutoModel.from_pretrained(model)
            if not isinstance(tokenizer, Tokenizer):
                raise TypeError(f"tokenizer should be inherited from"
                                f" BaseTokenizer, but got {type(tokenizer)}.")

        if not isinstance(model, (BaseModel, Model)):
            raise TypeError(f"model should be inherited from BaseModel or Model, but got type {type(model)}.")

        if tokenizer is None:
            raise ValueError(f"{self.__class__.__name__}"
                             " requires for a tokenizer.")

        if id2label is None:
            raise ValueError(f"{self.__class__.__name__}"
                             " requires for a dict which maps label id to label str.")

        self.id2label = id2label
        # Remembered by preprocess() so that postprocess() can slice entity text.
        self.input_text = ""
        super().__init__(model, tokenizer, **kwargs)

    def _sanitize_parameters(self, **pipeline_parameters):
        """Split pipeline parameters into preprocess, forward and postprocess groups.

        Args:
            **pipeline_parameters: keyword parameters given to the pipeline.

        Returns:
            A tuple ``(preprocess_params, forward_params, postprocess_params)``;
            the forward group is always empty.

        Raises:
            ValueError: If ``batch_size`` is given (batch inference is unsupported).
        """
        if 'batch_size' in pipeline_parameters:
            raise ValueError(f"The {self.__class__.__name__} does not support batch inference, please remove the "
                             f"batch_size")

        preprocess_key_name = ['max_length', 'padding']
        postprocess_key_name = ['id2label']

        preprocess_params = {k: v for k, v in pipeline_parameters.items()
                             if k in preprocess_key_name}
        postprocess_params = {k: v for k, v in pipeline_parameters.items()
                              if k in postprocess_key_name}
        return preprocess_params, {}, postprocess_params

    def preprocess(self, inputs, **preprocess_params):
        """
        Preprocess of token classification

        Args:
            inputs (str): the str to be classified.
            max_length (int): max length of tokenizer's output, 128 by default.
            padding (False / "max_length"): padding strategy, "max_length" by default.

        Return:
            dict with "input_ids", "input_mask" and "token_type_ids", each
            expanded with a leading axis of size 1.

        Raises:
            ValueError: If inputs is not a str.
        """
        if not isinstance(inputs, str):
            raise ValueError("Inputs type must be str")

        # Keep the raw text: postprocess() slices entity words out of it.
        self.input_text = inputs

        max_length = preprocess_params.pop("max_length", 128)
        padding = preprocess_params.pop("padding", "max_length")

        # The tokenizer is always asked for MindSpore tensors.
        encoded = self.tokenizer(inputs, max_length=max_length, padding=padding,
                                 return_tensors="ms", **preprocess_params)

        expand_dims = ops.ExpandDims()
        return {"input_ids": expand_dims(encoded["input_ids"], 0),
                "input_mask": expand_dims(encoded["attention_mask"], 0),
                "token_type_ids": expand_dims(encoded["token_type_ids"], 0)}

    def forward(self, model_inputs, **forward_params):
        """
        Forward process

        Args:
            model_inputs (dict): outputs of preprocess.

        Return:
            dict holding the network output under the key "logits".
        """
        self.model.set_train(False)
        logits = self.network(**model_inputs)
        return {"logits": logits}

    def postprocess(self, model_outputs, **postprocess_params):
        """
        Postprocess

        Args:
            model_outputs (dict): outputs of forward process.
            id2label (dict, optional): label mapping overriding the one given
                at construction time.

        Return:
            A list (one item per sequence) of lists of entity dicts with the
            keys "entity_group", "start", "end", "score" and "word".
        """
        # Honour a per-call mapping; fall back to the one stored on the instance.
        id2label = postprocess_params.get("id2label")
        if id2label is None:
            id2label = self.id2label

        logits = model_outputs["logits"].asnumpy()

        # Softmax over the label axis; the max is subtracted before exp().
        maxes = np.max(logits, axis=-1, keepdims=True)
        shifted_exp = np.exp(logits - maxes)
        probs = shifted_exp / shifted_exp.sum(axis=-1, keepdims=True)

        batch_pred_ids = np.argmax(probs, axis=2).tolist()
        batch_best_scores = np.max(probs, axis=2).tolist()

        # Drop the first and last position of every sequence (CLS and SEP token).
        # NOTE(review): with padding="max_length" the last position is presumably
        # a padding token rather than SEP - confirm against the tokenizer.
        pred_paths = [[id2label[id_] for id_ in pred_ids[1:-1]] for pred_ids in batch_pred_ids]
        best_scores = [scores[1:-1] for scores in batch_best_scores]

        total_result = []
        for pred_path, best_score in zip(pred_paths, best_scores):
            single_result = []
            for entity_group, start, end in self.get_entities_bios(pred_path):
                # Entity score is the mean of the per-token best probabilities.
                score = sum(best_score[start:end + 1]) / (end + 1 - start)
                single_result.append({
                    "entity_group": entity_group,
                    "start": start,
                    "end": end,
                    "score": round(score, 5),
                    # Token positions are used directly as character offsets.
                    "word": self.input_text[start:end + 1],
                })
            total_result.append(single_result)
        return total_result

    def get_entities_bios(self, seq):
        """Gets entities from sequence.

        note: BIOS

        A chunk opened by a ``B-`` tag is only reported once a matching ``I-``
        tag has given it an end position; a ``B-`` tag with no matching ``I-``
        continuation is dropped.

        Args:
            seq (list): sequence of labels.

        Returns:
            list: list of [chunk_type, chunk_start, chunk_end].

        Example:
            # >>> seq = ['B-PER', 'I-PER', 'O', 'S-LOC']
            # >>> get_entity_bios(seq)
            [['PER', 0, 1], ['LOC', 3, 3]]
        """
        chunks = []
        chunk = [-1, -1, -1]  # [type, start, end]; -1 marks "not set"
        last_index = len(seq) - 1

        for indx, tag in enumerate(seq):
            if tag.startswith("S-"):
                # Flush a pending chunk, then emit the single-token entity.
                if chunk[2] != -1:
                    chunks.append(chunk)
                chunks.append([tag.split('-')[1], indx, indx])
                chunk = [-1, -1, -1]
            elif tag.startswith("B-"):
                if chunk[2] != -1:
                    chunks.append(chunk)
                chunk = [tag.split('-')[1], indx, -1]
            elif tag.startswith('I-') and chunk[1] != -1:
                if tag.split('-')[1] == chunk[0]:
                    chunk[2] = indx
                # Only flush at the end of the sequence when the chunk has a
                # valid end; otherwise callers would get a span with end == -1.
                if indx == last_index and chunk[2] != -1:
                    chunks.append(chunk)
            else:
                if chunk[2] != -1:
                    chunks.append(chunk)
                chunk = [-1, -1, -1]
        return chunks