# Source code for mindformers.models.clip.clip_processor

# Copyright 2022 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================

"""
CLIPProcessor
"""
from typing import Optional, Union, List
import numpy as np
import PIL

import mindspore as ms

from mindformers.dataset import (
    BCHW2BHWC, BatchResize, BatchToTensor,
    BatchNormalize, BatchCenterCrop, BatchPILize
)
from mindformers.mindformer_book import MindFormerBook
from ..base_processor import BaseImageProcessor
from ..base_processor import BaseProcessor
from ...tools.register import MindFormerRegister, MindFormerModuleType


@MindFormerRegister.register(MindFormerModuleType.PROCESSOR)
class CLIPImageProcessor(BaseImageProcessor):
    """
    CLIPImageProcessor.

    Turns one image or a batch of images into a 4-rank ``ms.Tensor`` by
    chaining the ``BatchPILize``, ``BatchResize``, ``BatchCenterCrop``,
    ``BatchToTensor`` and ``BatchNormalize`` transforms, in that order.

    Args:
        image_resolution (int): The target size.

    Examples:
        >>> from mindformers import CLIPImageProcessor
        >>> from mindformers.tools.image_tools import load_image
        >>> processor = CLIPImageProcessor(image_resolution=256)
        >>> image = load_image("https://ascend-repo-modelzoo.obs.cn-east-2."
        ...                    "myhuaweicloud.com/XFormer_for_mindspore/clip/sunflower.png")
        >>> processor(image)
            Tensor(shape=[1, 3, 256, 256], dtype=Float32, value=
            [[[[-1.52949083e+000, -1.52949083e+000, ... -1.48569560e+000, -1.50029397e+000],
            [-1.52949083e+000, -1.52949083e+000, ... -1.50029397e+000, -1.50029397e+000],
            [-1.51489246e+000, -1.51489246e+000, ... -1.48569560e+000, -1.48569560e+000],
            ...
            ...
            [8.66091192e-001, 8.80311251e-001, ... -1.36645925e+000, -1.45177972e+000],
            [8.09210956e-001, 8.23431015e-001, ... -1.29535890e+000, -1.43755960e+000],
            [7.09670484e-001, 7.94990897e-001, ... -1.26691878e+000, -1.42333949e+000]]]])
    """
    def __init__(self, image_resolution: Optional[int] = 224):
        super().__init__(image_resolution=image_resolution)
        # One transform object per pipeline stage; they are applied in
        # `preprocess` in the order documented there.
        self.bchw2bhwc = BCHW2BHWC()
        self.batch_pilizer = BatchPILize()
        self.batch_resizer = BatchResize(image_resolution)
        self.batch_crop = BatchCenterCrop(image_resolution)
        self.batch_totensor = BatchToTensor()
        self.batch_normalizer = BatchNormalize()

    def preprocess(self, images: Union[ms.Tensor, PIL.Image.Image,
                                       np.ndarray, List[PIL.Image.Image]], **kwargs):
        r"""
        Preprocess Required By Base Processor.

        Args:
            images (ms.Tensor, PIL.Image, numpy.array, List[PIL.Image]): A batch of images.
            **kwargs: Extra keyword arguments; accepted for interface
                compatibility and not used by the pipeline.

        Returns:
            A 4-rank tensor for a batch of images.
        """
        # Inputs that do not look channel-last are transposed first.
        if not self._bhwc_check(images):
            images = self.bchw2bhwc(images)
        images = self.batch_pilizer(images)
        images = self.batch_resizer(images)
        images = self.batch_crop(images)
        images = self.batch_totensor(images)
        images = self.batch_normalizer(images)

        # NOTE(review): "other" is discarded without being read; presumably this
        # only keeps `kwargs` referenced -- confirm before removing.
        kwargs.pop("other", None)
        if isinstance(images, list):
            # Give every item a leading batch axis and join along it.
            # `np.vstack` is the function the deprecated `np.row_stack` aliased.
            return ms.Tensor(np.vstack([np.expand_dims(item, axis=0) for item in images]))
        if len(images.shape) == 4:
            return ms.Tensor(images)
        # Single image: add the batch axis so the result is always 4-rank.
        return ms.Tensor(np.expand_dims(images, axis=0))

    def _bhwc_check(self, image_batch: Union[ms.Tensor, PIL.Image.Image,
                                             np.ndarray, List[PIL.Image.Image]]):
        r"""
        Tell whether the input is treated as already channel-last.

        Lists and PIL images always count as channel-last. A numpy array or
        ``ms.Tensor`` counts only when its last dimension equals 3. Any other
        input type yields False.

        Note that this is a heuristic: a channel-first array whose last
        dimension happens to be 3 is also reported as channel-last.

        Args:
            image_batch (ms.Tensor, PIL.Image, numpy.array, List[PIL.Image]): The input to inspect.

        Returns:
            bool, True if no BCHW -> BHWC conversion should be applied.
        """
        if isinstance(image_batch, (list, PIL.Image.Image)):
            return True
        if isinstance(image_batch, (np.ndarray, ms.Tensor)):
            # Read the shape directly; converting a Tensor with `asnumpy()`
            # would copy all of its data just to inspect one dimension.
            return image_batch.shape[-1] == 3
        return False


@MindFormerRegister.register(MindFormerModuleType.PROCESSOR)
class CLIPProcessor(BaseProcessor):
    r"""CLIP Processor,
    consists of an image processor (BaseImageProcessor) for image input,
    and a tokenizer (BaseTokenizer) for text input.

    All constructor arguments are forwarded unchanged to ``BaseProcessor``.

    Args:
        image_processor (BaseImageProcessor): Used for process image data.
        tokenizer (BaseTokenizer): Used for process text data.
        max_length (Optional[int]): The length of text tokens.
        padding (Optional[str]): The padding strategy of tokenizer, [None, "max_length"].
        return_tensors (Optional[str]): The type of returned tensors for tokenizer, [None, "ms"].

    Examples:
        >>> from mindformers import CLIPProcessor
        >>> from mindformers.tools.image_tools import load_image
        >>> image = load_image("https://ascend-repo-modelzoo.obs.cn-east-2."
        ...                    "myhuaweicloud.com/XFormer_for_mindspore/clip/sunflower.png")
        >>> text = ["a boy", "a girl"]
        >>> CLIPProcessor.show_support_list()
            INFO - support list of CLIP Processor is:
            INFO -    ['clip_vit_b_32']
            INFO - -------------------------------------
        >>> processor = CLIPProcessor.from_pretrained('clip_vit_b_32')
        >>> processor(image, text)
            {'image': Tensor(shape=[1, 3, 224, 224], dtype=Float32, value=
            [[[[-1.52949083e+000, -1.52949083e+000,... -1.48569560e+000, -1.50029397e+000],
            [-1.52949083e+000, -1.52949083e+000, ... -1.50029397e+000, -1.50029397e+000],
            [-1.50029397e+000, -1.50029397e+000 ... -1.48569560e+000, -1.50029397e+000],
            ...
            [8.23431015e-001, 8.80311251e-001, ... -1.33801913e+000, -1.43755960e+000],
            [7.80770779e-001, 8.37651074e-001, ... -1.23847866e+000, -1.39489937e+000],
            [6.10130012e-001, 7.66550720e-001, ... -1.19581854e+000, -1.38067937e+000]]]]),
             'text': Tensor(shape=[2, 77], dtype=Int32, value=
            [[49406,   320,  1876 ...     0,     0,     0],
            [49406,   320,  1611 ...     0,     0,     0]])}
    """
    # Names listed under the 'clip' key of the processor support list.
    _support_list = MindFormerBook.get_processor_support_list()['clip']

    def __init__(self, image_processor, tokenizer,
                 max_length=77, padding='max_length', return_tensors='ms'):
        super().__init__(
            image_processor=image_processor,
            tokenizer=tokenizer,
            max_length=max_length,
            padding=padding,
            return_tensors=return_tensors)