Upload model

Browse files

Files changed (5) hide show

README.md +199 -0
config.json +43 -0
configuration.py +37 -0
model.safetensors +3 -0
modeling.py +251 -0

README.md ADDED Viewed

	@@ -0,0 +1,199 @@

+---
+library_name: transformers
+tags: []
+---
+# Model Card for Model ID
+<!-- Provide a quick summary of what the model is/does. -->
+## Model Details
+### Model Description
+<!-- Provide a longer summary of what this model is. -->
+This is the model card of a 🤗 transformers model that has been pushed to the Hub. This model card was generated automatically.
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+### Model Sources [optional]
+<!-- Provide the basic links for the model. -->
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+## Uses
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+### Direct Use
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+[More Information Needed]
+### Downstream Use [optional]
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+[More Information Needed]
+### Out-of-Scope Use
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+[More Information Needed]
+## Bias, Risks, and Limitations
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+[More Information Needed]
+### Recommendations
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information is needed to make further recommendations.
+## How to Get Started with the Model
+Use the code below to get started with the model.
+[More Information Needed]
+## Training Details
+### Training Data
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+[More Information Needed]
+### Training Procedure
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+#### Preprocessing [optional]
+[More Information Needed]
+#### Training Hyperparameters
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+#### Speeds, Sizes, Times [optional]
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+[More Information Needed]
+## Evaluation
+<!-- This section describes the evaluation protocols and provides the results. -->
+### Testing Data, Factors & Metrics
+#### Testing Data
+<!-- This should link to a Dataset Card if possible. -->
+[More Information Needed]
+#### Factors
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+[More Information Needed]
+#### Metrics
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+[More Information Needed]
+### Results
+[More Information Needed]
+#### Summary
+## Model Examination [optional]
+<!-- Relevant interpretability work for the model goes here -->
+[More Information Needed]
+## Environmental Impact
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+## Technical Specifications [optional]
+### Model Architecture and Objective
+[More Information Needed]
+### Compute Infrastructure
+[More Information Needed]
+#### Hardware
+[More Information Needed]
+#### Software
+[More Information Needed]
+## Citation [optional]
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+**BibTeX:**
+[More Information Needed]
+**APA:**
+[More Information Needed]
+## Glossary [optional]
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+[More Information Needed]
+## More Information [optional]
+[More Information Needed]
+## Model Card Authors [optional]
+[More Information Needed]
+## Model Card Contact
+[More Information Needed]

config.json ADDED Viewed

	@@ -0,0 +1,43 @@

+{
+  "architectures": [
+    "KPRModelForBert"
+  ],
+  "attention_probs_dropout_prob": 0.1,
+  "auto_map": {
+    "AutoConfig": "configuration.KPRConfigForBert",
+    "AutoModel": "modeling.KPRModelForBert"
+  },
+  "classifier_dropout": null,
+  "entity_embedding_size": 768,
+  "entity_fusion_activation": "sigmoid",
+  "entity_fusion_method": "multihead_attention",
+  "entity_vocab_size": null,
+  "gradient_checkpointing": false,
+  "hidden_act": "gelu",
+  "hidden_dropout_prob": 0.1,
+  "hidden_size": 768,
+  "id2label": {
+    "0": "LABEL_0"
+  },
+  "initializer_range": 0.02,
+  "intermediate_size": 3072,
+  "label2id": {
+    "LABEL_0": 0
+  },
+  "layer_norm_eps": 1e-12,
+  "max_position_embeddings": 512,
+  "model_type": "kpr-bert",
+  "num_attention_heads": 12,
+  "num_entity_fusion_attention_heads": 1,
+  "num_hidden_layers": 12,
+  "pad_token_id": 0,
+  "position_embedding_type": "absolute",
+  "similarity_function": "cosine",
+  "similarity_temperature": 0.02,
+  "torch_dtype": "float32",
+  "transformers_version": "4.55.2",
+  "type_vocab_size": 2,
+  "use_cache": true,
+  "use_entity_position_embeddings": true,
+  "vocab_size": 30522
+}

configuration.py ADDED Viewed

	@@ -0,0 +1,37 @@

+from transformers.models.bert import BertConfig
+from transformers.models.xlm_roberta import XLMRobertaConfig
+def _init_function(
+    self,
+    entity_vocab_size: int | None = 10000,
+    entity_embedding_size: int = 768,
+    entity_fusion_method: str = "multihead_attention",
+    use_entity_position_embeddings: bool = True,
+    entity_fusion_activation: str = "softmax",
+    num_entity_fusion_attention_heads: int = 12,
+    similarity_function: str = "dot",
+    similarity_temperature: float = 1.0,
+    *args,
+    **kwargs,
+):
+    self.entity_vocab_size = entity_vocab_size
+    self.entity_embedding_size = entity_embedding_size
+    self.entity_fusion_method = entity_fusion_method
+    self.use_entity_position_embeddings = use_entity_position_embeddings
+    self.entity_fusion_activation = entity_fusion_activation
+    self.num_entity_fusion_attention_heads = num_entity_fusion_attention_heads
+    self.similarity_function = similarity_function
+    self.similarity_temperature = similarity_temperature
+    super(self.__class__, self).__init__(*args, **kwargs)
+class KPRConfigForBert(BertConfig):
+    __init__ = _init_function
+    model_type = "kpr-bert"
+class KPRConfigForXLMRoberta(XLMRobertaConfig):
+    __init__ = _init_function
+    model_type = "kpr-xlm-roberta"

model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:94a371d44cc6e02eb0b65f72235ab0ad0b239ac7ddab67fa36769315439287f9
+size 448988504

modeling.py ADDED Viewed

	@@ -0,0 +1,251 @@

+import math
+import torch
+import torch.distributed as dist
+import torch.nn.functional as F
+from torch import Tensor, nn
+from transformers import PretrainedConfig
+from transformers.file_utils import ModelOutput
+from transformers.models.bert import BertModel, BertPreTrainedModel
+from transformers.models.xlm_roberta import XLMRobertaModel, XLMRobertaPreTrainedModel
+from .configuration import KPRConfigForBert, KPRConfigForXLMRoberta
+class EntityEmbeddings(nn.Module):
+    def __init__(self, config: PretrainedConfig):
+        super().__init__()
+        self.config = config
+        if config.entity_vocab_size is not None:
+            self.embeddings = nn.Embedding(config.entity_vocab_size, config.entity_embedding_size, padding_idx=0)
+            self.embeddings.weight.requires_grad = False
+        # The 0-th position corresponds to the [CLS] token which does not correspond to any entity
+        self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size, padding_idx=0)
+        self.dense = nn.Linear(config.entity_embedding_size, config.hidden_size, bias=False)
+        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+        self.dropout = nn.Dropout(config.hidden_dropout_prob)
+    def forward(self, entity_ids: Tensor | None, entity_embeds: Tensor | None, entity_position_ids: Tensor) -> Tensor:
+        if entity_embeds is not None:
+            entity_embeddings = entity_embeds
+        elif entity_ids is not None:
+            if self.config.entity_vocab_size is None:
+                raise ValueError("Entity embeddings are not constructed because entity_vocab_size is None.")
+            entity_embeddings = self.embeddings(entity_ids)
+        else:
+            raise ValueError("Either entity_ids or entity_embeds need to be provided.")
+        entity_embeddings = self.dense(entity_embeddings)
+        if self.config.use_entity_position_embeddings:
+            entity_position_embeddings = self.position_embeddings(
+                entity_position_ids
+            )  # batch, entities, positions, hidden
+            entity_position_embeddings = torch.sum(entity_position_embeddings, dim=2)
+            entity_position_embeddings = entity_position_embeddings / entity_position_ids.ne(0).sum(dim=2).clamp(
+                min=1
+            ).unsqueeze(-1)
+            entity_embeddings = entity_embeddings + entity_position_embeddings
+        entity_embeddings = self.LayerNorm(entity_embeddings)
+        entity_embeddings = self.dropout(entity_embeddings)
+        return entity_embeddings
+class EntityFusionMultiHeadAttention(nn.Module):
+    def __init__(self, config: PretrainedConfig):
+        super().__init__()
+        self.config = config
+        self.num_attention_heads = config.num_entity_fusion_attention_heads
+        self.attention_head_size = int(config.hidden_size / self.num_attention_heads)
+        self.query = nn.Linear(config.hidden_size, config.hidden_size)
+        self.key = nn.Linear(config.hidden_size, config.hidden_size)
+        self.value = nn.Linear(config.hidden_size, config.hidden_size)
+    def transpose_for_scores(self, x: torch.Tensor) -> torch.Tensor:
+        new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
+        x = x.view(new_x_shape)
+        return x.permute(0, 2, 1, 3)
+    def forward(
+        self, query: torch.Tensor, key: torch.Tensor, value: torch.Tensor, key_padding_mask: torch.Tensor
+    ) -> tuple[torch.Tensor, torch.Tensor]:
+        query_layer = self.transpose_for_scores(self.query(query))
+        key_layer = self.transpose_for_scores(self.key(key))
+        value_layer = self.transpose_for_scores(self.value(value))
+        attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))
+        attention_scores = attention_scores / math.sqrt(self.attention_head_size)
+        dtype = attention_scores.dtype
+        key_padding_mask_scores = key_padding_mask[:, None, None, :]
+        key_padding_mask_scores = key_padding_mask_scores.to(dtype=dtype)
+        key_padding_mask_scores = key_padding_mask_scores * torch.finfo(dtype).min
+        attention_scores = attention_scores + key_padding_mask_scores
+        orig_attention_scores = attention_scores.clone()
+        if self.config.entity_fusion_activation == "sigmoid":
+            # https://arxiv.org/abs/2409.04431
+            entity_fusion_sigmoid_bias = key_padding_mask.eq(0).sum(dim=-1, keepdim=True)[:, :, None, None]
+            entity_fusion_sigmoid_bias = entity_fusion_sigmoid_bias.to(dtype)
+            entity_fusion_sigmoid_bias = -torch.log(entity_fusion_sigmoid_bias)
+            attention_scores = attention_scores + entity_fusion_sigmoid_bias
+            normalized_attention_scores = torch.sigmoid(attention_scores)
+        else:
+            normalized_attention_scores = nn.functional.softmax(attention_scores, dim=-1)
+        context_layer = torch.matmul(normalized_attention_scores, value_layer)
+        context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
+        new_context_layer_shape = context_layer.size()[:-2] + (self.config.hidden_size,)
+        context_layer = context_layer.view(new_context_layer_shape)
+        return (context_layer, orig_attention_scores)
+class EntityFusionLayer(nn.Module):
+    def __init__(self, config: PretrainedConfig):
+        super().__init__()
+        self.config = config
+        self.entity_embeddings = EntityEmbeddings(config)
+        self.entity_fusion_layer = EntityFusionMultiHeadAttention(config)
+        self.dropout = nn.Dropout(config.hidden_dropout_prob)
+        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+        self.noop_embeddings = nn.Parameter(torch.zeros(1, 1, config.hidden_size))
+    def forward(
+        self,
+        entity_ids: Tensor | None,
+        entity_embeds: Tensor | None,
+        entity_position_ids: Tensor,
+        cls_embeddings: Tensor,
+    ) -> tuple[torch.Tensor, torch.Tensor]:
+        entity_embeddings = self.entity_embeddings(entity_ids, entity_embeds, entity_position_ids)
+        batch_size = entity_ids.size(0)
+        kv_embeddings = entity_embeddings
+        key_padding_mask = entity_ids.eq(0)
+        cls_embeddings = cls_embeddings.unsqueeze(1)
+        noop_embeddings = self.noop_embeddings.expand(batch_size, 1, -1)
+        kv_embeddings = torch.cat([noop_embeddings, kv_embeddings], dim=1)
+        noop_padding_mask = torch.zeros(batch_size, 1, device=entity_ids.device, dtype=torch.bool)
+        key_padding_mask = torch.cat([noop_padding_mask, key_padding_mask], dim=1)
+        entity_embeddings, attention_scores = self.entity_fusion_layer(
+            cls_embeddings, kv_embeddings, kv_embeddings, key_padding_mask=key_padding_mask
+        )
+        entity_embeddings = self.dropout(entity_embeddings)
+        output_embeddings = entity_embeddings + cls_embeddings
+        output_embeddings = self.LayerNorm(output_embeddings)
+        output_embeddings = output_embeddings.squeeze(1)
+        return output_embeddings, attention_scores
+class KPRMixin:
+    def _forward(self, **inputs: dict[str, Tensor]) -> tuple[Tensor] | tuple[Tensor, Tensor] | ModelOutput:
+        return_dict = inputs.pop("return_dict", True)
+        if self.training:
+            query_embeddings = self.encode(**inputs["queries"])
+            passage_embeddings = self.encode(**inputs["passages"])
+            query_embeddings = self._dist_gather_tensor(query_embeddings)
+            passage_embeddings = self._dist_gather_tensor(passage_embeddings)
+            scores = self._compute_similarity(query_embeddings, passage_embeddings)
+            scores = scores / self.config.similarity_temperature
+            scores = scores.view(query_embeddings.size(0), -1)
+            ce_target = torch.arange(scores.size(0), device=scores.device, dtype=torch.long)
+            ce_target = ce_target * (passage_embeddings.size(0) // query_embeddings.size(0))
+            loss = torch.nn.CrossEntropyLoss(reduction="mean")(scores, ce_target)
+            if return_dict:
+                return ModelOutput(loss=loss, scores=scores)
+            else:
+                return (loss, scores)
+        else:
+            sentence_embeddings = self.encode(**inputs).unsqueeze(1)
+            if return_dict:
+                return ModelOutput(sentence_embeddings=sentence_embeddings)
+            else:
+                return (sentence_embeddings,)
+    def encode(self, **inputs: dict[str, Tensor]) -> Tensor:
+        entity_ids = inputs.pop("entity_ids", None)
+        entity_position_ids = inputs.pop("entity_position_ids", None)
+        entity_embeds = inputs.pop("entity_embeds", None)
+        outputs = getattr(self, self.base_model_prefix)(**inputs)
+        output_embeddings = outputs.last_hidden_state[:, 0]
+        if self.config.entity_fusion_method != "none":
+            output_embeddings, _ = self.entity_fusion_layer(
+                entity_ids=entity_ids,
+                entity_embeds=entity_embeds,
+                entity_position_ids=entity_position_ids,
+                cls_embeddings=output_embeddings,
+            )
+        if self.config.similarity_function == "cosine":
+            output_embeddings = F.normalize(output_embeddings, p=2, dim=-1)
+        return output_embeddings
+    def _dist_gather_tensor(self, t: torch.Tensor) -> torch.Tensor:
+        t = t.contiguous()
+        tensor_list = [torch.empty_like(t) for _ in range(dist.get_world_size())]
+        dist.all_gather(tensor_list, t)
+        tensor_list[dist.get_rank()] = t
+        gathered_tensor = torch.cat(tensor_list, dim=0)
+        return gathered_tensor
+    def _compute_similarity(self, query_embeddings: Tensor, passage_embeddings: Tensor) -> Tensor:
+        return torch.matmul(query_embeddings, passage_embeddings.transpose(-2, -1))
+class KPRModelForBert(BertPreTrainedModel, KPRMixin):
+    config_class = KPRConfigForBert
+    def __init__(self, config: KPRConfigForBert):
+        BertPreTrainedModel.__init__(self, config)
+        self.bert = BertModel(config)
+        if self.config.entity_fusion_method != "none":
+            self.entity_fusion_layer = EntityFusionLayer(config)
+        self.post_init()
+    def forward(self, *args, **kwargs):
+        return self._forward(*args, **kwargs)
+class KPRModelForXLMRoberta(XLMRobertaPreTrainedModel, KPRMixin):
+    config_class = KPRConfigForXLMRoberta
+    def __init__(self, config: KPRConfigForXLMRoberta):
+        XLMRobertaPreTrainedModel.__init__(self, config)
+        self.roberta = XLMRobertaModel(config)
+        if self.config.entity_fusion_method != "none":
+            self.entity_fusion_layer = EntityFusionLayer(config)
+        self.post_init()
+    def forward(self, *args, **kwargs):
+        return self._forward(*args, **kwargs)