mindformers.models.clip.clip_processor 源代码

# Copyright 2022 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================

"""
CLIPProcessor
"""
from typing import Optional, Union, List
import numpy as np
import PIL

import mindspore as ms

from mindformers.dataset import (
    BCHW2BHWC, BatchResize, BatchToTensor,
    BatchNormalize, BatchCenterCrop, BatchPILize
)
from mindformers.mindformer_book import MindFormerBook
from ..base_processor import BaseImageProcessor
from ..base_processor import BaseProcessor
from ...tools.register import MindFormerRegister, MindFormerModuleType


[文档]@MindFormerRegister.register(MindFormerModuleType.PROCESSOR) class CLIPImageProcessor(BaseImageProcessor): """ CLIPImageProcessor. Args: image_resolution (int): The target size. Examples: >>> from mindformers import CLIPImageProcessor >>> from mindformers.tools.image_tools import load_image >>> processor = CLIPImageProcessor(image_resolution=256) >>> image = load_image("https://ascend-repo-modelzoo.obs.cn-east-2." ... "myhuaweicloud.com/XFormer_for_mindspore/clip/sunflower.png") >>> processor(image) Tensor(shape=[1, 3, 256, 256], dtype=Float32, value= [[[[-1.52949083e+000, -1.52949083e+000, ... -1.48569560e+000, -1.50029397e+000], [-1.52949083e+000, -1.52949083e+000, ... -1.50029397e+000, -1.50029397e+000], [-1.51489246e+000, -1.51489246e+000, ... -1.48569560e+000, -1.48569560e+000], ... ... [8.66091192e-001, 8.80311251e-001, ... -1.36645925e+000, -1.45177972e+000], [8.09210956e-001, 8.23431015e-001, ... -1.29535890e+000, -1.43755960e+000], [7.09670484e-001, 7.94990897e-001, ... -1.26691878e+000, -1.42333949e+000]]]]) """ def __init__(self, image_resolution: Optional[int] = 224): super(CLIPImageProcessor, self).__init__( image_resolution=image_resolution) self.bchw2bhwc = BCHW2BHWC() self.batch_pilizer = BatchPILize() self.batch_resizer = BatchResize(image_resolution) self.batch_crop = BatchCenterCrop(image_resolution) self.batch_totensor = BatchToTensor() self.batch_normalizer = BatchNormalize()
[文档] def preprocess(self, images: Union[ms.Tensor, PIL.Image.Image, np.ndarray, List[PIL.Image.Image]], **kwargs): r""" Preprocess Required By Base Processor. Args: images (ms.Tensor, PIL.Image, numpy.array, List[PIL.Image]): A batch of images. Return: A 4-rank tensor for a batch of images. """ if not self._bhwc_check(images): images = self.bchw2bhwc(images) images = self.batch_pilizer(images) images = self.batch_resizer(images) images = self.batch_crop(images) images = self.batch_totensor(images) images = self.batch_normalizer(images) kwargs.pop("other", None) if isinstance(images, list): return ms.Tensor(np.row_stack([np.expand_dims(item, axis=0) for item in images])) if len(images.shape) == 4: return ms.Tensor(images) return ms.Tensor(np.expand_dims(images, axis=0))
def _bhwc_check(self, image_batch: Union[ms.Tensor, PIL.Image.Image, np.ndarray, List[PIL.Image.Image]]): r"""Bhwc_check""" if isinstance(image_batch, np.ndarray): if image_batch.shape[-1] == 3: return True if isinstance(image_batch, ms.Tensor): if image_batch.asnumpy().shape[-1] == 3: return True if isinstance(image_batch, (list, PIL.Image.Image)): return True return False
[文档]@MindFormerRegister.register(MindFormerModuleType.PROCESSOR) class CLIPProcessor(BaseProcessor): r"""CLIP Processor, consists of a feature extractor (BaseFeatureEXtractor) for image input, and a tokenizer (BaseTokenizer) for text input. Args: image_processor (BaseImageProcessor): Used for process image data. tokenizer (BaseTokenizer): Used for process text data. max_length (Optional[int]): The length of text tokens. padding (Optional[str]): The padding strategy of tokenizer, [None, "max_length"]. return_tensors (Optional[str]): The type of returned tensors for tokenizer, [None, "ms"]. Examples: >>> from mindformers import CLIPProcessor >>> from mindformers.tools.image_tools import load_image >>> image = load_image("https://ascend-repo-modelzoo.obs.cn-east-2." ... "myhuaweicloud.com/XFormer_for_mindspore/clip/sunflower.png") >>> text = ["a boy", "a girl"] >>> CLIPProcessor.show_support_list() INFO - support list of CLIP Processor is: INFO - ['clip_vit_b_32'] INFO - ------------------------------------- >>> processor = CLIPProcessor.from_pretrained('clip_vit_b_32') >>> processor(image, text) {'image': Tensor(shape=[1, 3, 224, 224], dtype=Float32, value= [[[[-1.52949083e+000, -1.52949083e+000,... -1.48569560e+000, -1.50029397e+000], [-1.52949083e+000, -1.52949083e+000, ... -1.50029397e+000, -1.50029397e+000], [-1.50029397e+000, -1.50029397e+000 ... -1.48569560e+000, -1.50029397e+000], ... [8.23431015e-001, 8.80311251e-001, ... -1.33801913e+000, -1.43755960e+000], [7.80770779e-001, 8.37651074e-001, ... -1.23847866e+000, -1.39489937e+000], [6.10130012e-001, 7.66550720e-001, ... -1.19581854e+000, -1.38067937e+000]]]]), 'text': Tensor(shape=[2, 77], dtype=Int32, value= [[49406, 320, 1876 ... 0, 0, 0], [49406, 320, 1611 ... 0, 0, 0]])} """ _support_list = MindFormerBook.get_processor_support_list()['clip'] def __init__(self, image_processor, tokenizer, max_length=77, padding='max_length', return_tensors='ms'): super(CLIPProcessor, self).__init__( image_processor=image_processor, tokenizer=tokenizer, max_length=max_length, padding=padding, return_tensors=return_tensors)