# Source code for mindformers.models.clip.clip

# Copyright 2022 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================

"""
CLIPModel
"""
from typing import Optional, Union

import numpy as np
import mindspore as ms

from mindspore import nn
from mindspore.ops import functional as F
from mindspore.common.initializer import Normal, initializer
from mindspore import Parameter, Tensor
import mindspore.ops as ops

from mindformers.version_control import get_norm
from ...mindformer_book import MindFormerBook
from ..base_model import BaseModel
from .clip_modules import VisionTransformer, Transformer, LayerNorm
from .clip_config import CLIPConfig
from ...tools.register import MindFormerRegister, MindFormerModuleType


@MindFormerRegister.register(MindFormerModuleType.MODELS)
class CLIPModel(BaseModel):
    r"""CLIP model combining a vision transformer and a text transformer.

    The supported model name could be selected from
    ``CLIPModel.show_support_list()``.

    Args:
        config (CLIPConfig): The config of clip model, which could be obtained
            by CLIPConfig class.

    Examples:
        >>> from mindformers import CLIPModel
        >>> CLIPModel.show_support_list()
        INFO - support list of CLIPModel is:
        INFO -    ['clip_vit_b_32']
        INFO - -------------------------------------
        >>> model = CLIPModel.from_pretrained('clip_vit_b_32')
        >>> type(model)
        <class 'mindformers.models.clip.clip.CLIPModel'>
    """
    _support_list = MindFormerBook.get_model_support_list()['clip']

    def __init__(self, config: CLIPConfig):
        """Build the image tower, the text tower and the projection parameters.

        Args:
            config (CLIPConfig): Supplies ``dtype``, ``projection_dim`` and the
                nested ``vision_config`` / ``text_config`` read below.
        """
        super(CLIPModel, self).__init__(config)
        # Compute dtype used by both towers and by the attention mask.
        self.dtype = self.get_dtype(config.dtype)
        self.cross_entropy = nn.SoftmaxCrossEntropyWithLogits(reduction="mean", sparse=True)

        # Must be set before build_attention_mask() is called below.
        self.max_position_embeddings = config.text_config.max_position_embeddings

        self.visual = VisionTransformer(
            input_resolution=config.vision_config.image_size,
            patch_size=config.vision_config.patch_size,
            width=config.vision_config.hidden_size,
            layers=config.vision_config.num_hidden_layers,
            heads=config.vision_config.num_attention_heads,
            output_dim=config.projection_dim,
            dtype=self.dtype
        )

        self.transformer = Transformer(
            width=config.text_config.hidden_size,
            layers=config.text_config.num_hidden_layers,
            heads=config.text_config.num_attention_heads,
            dtype=self.dtype,
            attn_mask=self.build_attention_mask()
        )

        self.token_embedding = nn.Embedding(
            config.text_config.vocab_size,
            config.text_config.hidden_size,
            embedding_table=Normal(mean=0.0, sigma=0.02))
        self.positional_embedding = Parameter(initializer(
            Normal(mean=0.0, sigma=0.01),
            [config.text_config.max_position_embeddings,
             config.text_config.hidden_size]))
        self.ln_final = LayerNorm([config.text_config.hidden_size])

        # Projects text hidden states into the shared projection_dim space.
        self.text_projection = Parameter(initializer(
            Normal(mean=0.0, sigma=config.text_config.hidden_size ** -0.5),
            [config.text_config.hidden_size, config.projection_dim],
            ms.float32))
        # Stored in log space (initialised to log(1 / 0.07)); construct()
        # exponentiates it before scaling the similarity logits.
        self.logit_scale = Parameter(Tensor(np.log(1 / 0.07)).astype(ms.float32))
        self.exp = ops.Exp()
        self.norm = get_norm()

        self.load_checkpoint(config)

    def get_dtype(self, dtype: str):
        """Map a dtype name to the corresponding MindSpore dtype.

        Args:
            dtype (str): Either ``"float16"`` or ``"float32"``.

        Returns:
            ``ms.float16`` or ``ms.float32``.

        Raises:
            TypeError: If ``dtype`` is any other value.
        """
        if dtype == "float16":
            return ms.float16
        if dtype == "float32":
            return ms.float32
        raise TypeError("unsupported data type.")

    def construct(self, image: ms.Tensor, text: ms.Tensor,
                  label: Optional[Union[ms.Tensor, np.ndarray]] = None,
                  input_ids: Optional[ms.Tensor] = None,
                  pixel_values: Optional[ms.Tensor] = None):
        r"""Construct

        Args:
            image (Tensor): A image tensor processed by image_processor.
            text (Tensor): A text id tensor processed by tokenizer. When it is
                rank 3, only its first entry along axis 0 is used.
            input_ids (Optional[ms.Tensor]): Equal to "text",
                if "input_ids" is set, "text" is useless.
            pixel_values (Optional[ms.Tensor]): Equal to "image",
                if "pixel_values" is set, "image" is useless.
            label (Optional[Union[ms.Tensor, np.ndarray]]): The classification label.

        Returns:
            if not self.training:
                if label is None:
                    logits_per_image: Similarity between image and text.
                    logits_per_text: Similarity between text and image.
                else:
                    logits_per_image: Similarity between image and text.
                    label: The classification label.
            else:
                loss: Contrastive language image pretraining loss.

        Examples:
            >>> import numpy as np
            >>> from mindformers import CLIPModel, CLIPProcessor
            >>> processor = CLIPProcessor.from_pretrained('clip_vit_b_32')
            >>> model = CLIPModel.from_pretrained('clip_vit_b_32')
            >>> fake_image_batch = np.random.random((5, 3, 578, 213))
            >>> fake_text_batch = ["a boy", "a girl", "a women", "a men"]
            >>> model(**processor(fake_image_batch, fake_text_batch))
            (Tensor(shape=[5, 4], dtype=Float32, value=
            [[2.26097965e+001, 2.29247952e+001, 2.40179482e+001, 2.30396290e+001],
            [2.26102257e+001, 2.29256859e+001, 2.40180817e+001, 2.30393028e+001],
            [2.26097965e+001, 2.29247952e+001, 2.40179482e+001, 2.30396290e+001],
            [2.26109924e+001, 2.29261818e+001, 2.40193062e+001, 2.30404854e+001],
            [2.26097965e+001, 2.29247952e+001, 2.40179482e+001, 2.30396290e+001]]),
            Tensor(shape=[4, 5], dtype=Float32, value=
            ...))
        """
        if pixel_values is not None:
            image = pixel_values

        if input_ids is not None:
            text = input_ids

        if len(text.shape) == 3:
            # Keep only the first entry along axis 0. No squeeze() here: it
            # would also drop a size-1 axis, turning a single prompt into a
            # rank-1 tensor that get_text_features cannot transpose(1, 0, 2).
            text = text[0]

        image_features = self.get_image_features(image)
        text_features = self.get_text_features(text)

        # L2-normalise along axis 1 so the matmul below is a scaled cosine
        # similarity.
        image_features = image_features / self.norm(image_features, dim=1, keepdim=True)
        text_features = text_features / self.norm(text_features, dim=1, keepdim=True)
        logit_scale = self.exp(self.logit_scale)

        # Shared by the evaluation outputs and the training loss.
        logits_per_image = ops.matmul(logit_scale * image_features, text_features.T)

        if not self.training:
            if label is None:
                return logits_per_image, logits_per_image.T
            return logits_per_image, label

        # Training: the i-th image is paired with the i-th text, so the target
        # class of row i is i, in both directions.
        batch_size, _ = F.shape(logits_per_image)
        labels = ms.Tensor(np.arange(batch_size))

        images_loss = self.cross_entropy(logits_per_image, labels)
        texts_loss = self.cross_entropy(logits_per_image.T, labels)
        return (images_loss + texts_loss) / 2

    def build_attention_mask(self):
        """Build the additive attention mask passed to the text transformer.

        Returns:
            Tensor: A square matrix of side ``self.max_position_embeddings``
            cast to ``self.dtype``, holding ``-inf`` strictly above the main
            diagonal and ``0`` elsewhere.
        """
        size = self.max_position_embeddings
        mask = np.ones((size, size))
        # k=1 leaves the diagonal and everything below it at zero.
        mask = np.triu(mask * float("-inf"), k=1)
        return Tensor(mask).astype(self.dtype)

    def get_image_features(self, image: ms.Tensor, pixel_values: Optional[ms.Tensor] = None):
        r"""Get_image_features

        Args:
            image (ms.Tensor): A image tensor processed by image_processor.
            pixel_values (Optional[ms.Tensor]): Equal to "image",
                if "pixel_values" is set, "image" is useless.

        Returns:
            Image feature.

        Examples:
            >>> import numpy as np
            >>> from mindformers import CLIPModel, CLIPProcessor
            >>> processor = CLIPProcessor.from_pretrained('clip_vit_b_32')
            >>> model = CLIPModel.from_pretrained('clip_vit_b_32')
            >>> fake_image_batch = np.random.random((5, 3, 578, 213))
            >>> model.get_image_features(processor.image_processor(fake_image_batch))
            Tensor(shape=[5, 512], dtype=Float32, value=
            [[-1.50102973e-001, -2.63687313e-001, -5.65953791e-001 ... -2.93511450e-001],
             [-1.50103331e-001, -2.63622820e-001, -5.65623760e-001 ... -2.93337226e-001],
             [-1.50102973e-001, -2.63687313e-001, -5.65953791e-001 ... -2.93511450e-001],
             [-1.49712294e-001, -2.64100820e-001, -5.65740824e-001 ... -2.93599486e-001],
             [-1.50102973e-001, -2.63687313e-001, -5.65953791e-001 ... -2.93511450e-001]])
        """
        if pixel_values is not None:
            image = pixel_values

        # Cast to the model's compute dtype before the vision tower.
        image = image.astype(self.dtype)
        return self.visual(image)

    def get_text_features(self, text: ms.Tensor, input_ids: Optional[ms.Tensor] = None):
        r"""Get_text_features

        Args:
            text (ms.Tensor): A text id tensor processed by tokenizer.
            input_ids (Optional[ms.Tensor]): Equal to "text",
                if "input_ids" is set, "text" is useless.

        Returns:
            Text feature.

        Examples:
            >>> from mindformers import CLIPModel, CLIPProcessor
            >>> processor = CLIPProcessor.from_pretrained('clip_vit_b_32')
            >>> model = CLIPModel.from_pretrained('clip_vit_b_32')
            >>> fake_text_batch = ["a boy", "a girl", "a women", "a men"]
            >>> text = processor.tokenizer(
            ...     fake_text_batch, max_length=77, padding="max_length", return_tensors="ms"
            ... )["input_ids"]
            >>> model.get_text_features(text)
            Tensor(shape=[4, 512], dtype=Float32, value=
            [[6.03631809e-002, 1.79528534e-001, ... -2.23753393e-001, 1.42413378e-002],
            [1.28974199e-001, 7.46373609e-002, ... -3.68579805e-001, 1.53980583e-001],
            [9.89909172e-002, 2.01410800e-002, ... -2.54495114e-001, 7.68117979e-002],
            [3.16975415e-002, 2.26992741e-001, ... -5.22942394e-002, 1.98922127e-001]])
        """
        if input_ids is not None:
            text = input_ids

        text_ = self.token_embedding(text).astype(self.dtype)
        text_ = ops.Add()(text_, self.positional_embedding).astype(self.dtype)
        # The first two axes are swapped for the transformer call and swapped
        # back afterwards.
        text_ = text_.transpose(1, 0, 2)
        text_ = self.transformer(text_)
        text_ = text_.transpose(1, 0, 2)
        text_ = self.ln_final(text_).astype(ms.float32)

        # For each sequence, take the hidden state at the position of its
        # largest token id, then project it.
        # NOTE(review): presumably that position is the end-of-text token --
        # confirm against the tokenizer's vocabulary.
        text_ = ops.MatMul()(
            text_[ms.numpy.arange(text_.shape[0]), text.argmax(-1)],
            self.text_projection)
        return text_