nielsr (HF staff) committed
Commit 564c4b6 (verified)
1 parent: d956247

Add pipeline tag, library name, link to paper

This PR improves the model card by adding `pipeline_tag: question-answering`, ensuring people can find your model at https://huggingface.co/models?pipeline_tag=question-answering. It also sets the appropriate library name (Transformers) and links to the paper at https://huggingface.co/papers/2502.11275.

Files changed (1)
  1. README.md +174 -1
README.md CHANGED
@@ -1,9 +1,13 @@
  ---
  license: mit
+ library_name: transformers
+ pipeline_tag: question-answering
  ---

  # Cuckoo 🐦 [[Github]](https://github.com/KomeijiForce/Cuckoo)

+ This repository contains the model of the paper [Cuckoo: An IE Free Rider Hatched by Massive Nutrition in LLM's Nest](https://huggingface.co/papers/2502.11275).
+
  Cuckoo is a small (300M) information extraction (IE) model that imitates the next token prediction paradigm of large language models. Instead of retrieving from the vocabulary, Cuckoo predicts the next tokens by tagging them in the given input context as shown below:

  ![cuckoo](https://github.com/user-attachments/assets/d000f275-82a7-4939-aca8-341c61a774dc)
@@ -155,4 +159,173 @@ sea ['blue']
  fire ['red']
  night []
  ```
- which shows Cuckoo is not extracting any plausible spans but has the knowledge to understand the context.
+ which shows Cuckoo is not extracting any plausible spans but has the knowledge to understand the context.
+
+ # File information
+
+ The repository contains the following file information:
+
+ Filename: special_tokens_map.json
+ Content: {
+   "bos_token": {
+     "content": "<s>",
+     "lstrip": false,
+     "normalized": true,
+     "rstrip": false,
+     "single_word": false
+   },
+   "cls_token": {
+     "content": "<s>",
+     "lstrip": false,
+     "normalized": true,
+     "rstrip": false,
+     "single_word": false
+   },
+   "eos_token": {
+     "content": "</s>",
+     "lstrip": false,
+     "normalized": true,
+     "rstrip": false,
+     "single_word": false
+   },
+   "mask_token": {
+     "content": "<mask>",
+     "lstrip": true,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "pad_token": {
+     "content": "<pad>",
+     "lstrip": false,
+     "normalized": true,
+     "rstrip": false,
+     "single_word": false
+   },
+   "sep_token": {
+     "content": "</s>",
+     "lstrip": false,
+     "normalized": true,
+     "rstrip": false,
+     "single_word": false
+   },
+   "unk_token": {
+     "content": "<unk>",
+     "lstrip": false,
+     "normalized": true,
+     "rstrip": false,
+     "single_word": false
+   }
+ }
+
+ Filename: tokenizer_config.json
+ Content: {
+   "add_prefix_space": true,
+   "added_tokens_decoder": {
+     "0": {
+       "content": "<s>",
+       "lstrip": false,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "1": {
+       "content": "<pad>",
+       "lstrip": false,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "2": {
+       "content": "</s>",
+       "lstrip": false,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "3": {
+       "content": "<unk>",
+       "lstrip": false,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "50264": {
+       "content": "<mask>",
+       "lstrip": true,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     }
+   },
+   "bos_token": "<s>",
+   "clean_up_tokenization_spaces": false,
+   "cls_token": "<s>",
+   "eos_token": "</s>",
+   "errors": "replace",
+   "mask_token": "<mask>",
+   "max_length": 512,
+   "model_max_length": 512,
+   "pad_token": "<pad>",
+   "sep_token": "</s>",
+   "stride": 0,
+   "tokenizer_class": "RobertaTokenizer",
+   "trim_offsets": true,
+   "truncation_side": "right",
+   "truncation_strategy": "longest_first",
+   "unk_token": "<unk>"
+ }
+
+ Filename: merges.txt
+ Content: "Content of the file is larger than 50 KB, too long to display."
+
+ Filename: vocab.json
+ Content: "Content of the file is larger than 50 KB, too long to display."
+
+ Filename: config.json
+ Content: {
+   "_name_or_path": "models/ptr-large-c4-stage9",
+   "architectures": [
+     "RobertaForTokenClassification"
+   ],
+   "attention_probs_dropout_prob": 0.1,
+   "bos_token_id": 0,
+   "classifier_dropout": null,
+   "eos_token_id": 2,
+   "finetuning_task": "ner",
+   "hidden_act": "gelu",
+   "hidden_dropout_prob": 0.1,
+   "hidden_size": 1024,
+   "id2label": {
+     "0": "B",
+     "1": "I",
+     "2": "O"
+   },
+   "initializer_range": 0.02,
+   "intermediate_size": 4096,
+   "label2id": {
+     "B": 0,
+     "I": 1,
+     "O": 2
+   },
+   "layer_norm_eps": 1e-05,
+   "max_position_embeddings": 514,
+   "model_type": "roberta",
+   "num_attention_heads": 16,
+   "num_hidden_layers": 24,
+   "pad_token_id": 1,
+   "position_embedding_type": "absolute",
+   "torch_dtype": "float32",
+   "transformers_version": "4.45.2",
+   "type_vocab_size": 1,
+   "use_cache": true,
+   "vocab_size": 50265
+ }
+
+ Filename: tokenizer.json
+ Content: "Content of the file is larger than 50 KB, too long to display."
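
Below is a minimal, hypothetical sketch of how this checkpoint could be exercised through the Transformers token-classification API, in line with the model card's description of Cuckoo as predicting the next tokens by tagging them in the input context and with the B/I/O labels declared in `config.json` above. The model id and the example prompt are placeholders (assumptions, not taken from this repository); the exact prompting scheme is documented in the project README at https://github.com/KomeijiForce/Cuckoo.

```python
# Minimal sketch (not the official usage): load the checkpoint as a RoBERTa token
# classifier and merge its B/I/O tags into extracted spans.
import torch
from transformers import AutoModelForTokenClassification, AutoTokenizer

model_id = "KomeijiForce/Cuckoo-C4-Rainbow"  # placeholder; replace with this repo's id
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForTokenClassification.from_pretrained(model_id)
model.eval()

def extract_spans(text: str) -> list[str]:
    """Tag every token as B/I/O (the labels in config.json) and merge B/I runs into spans."""
    enc = tokenizer(text, return_tensors="pt", truncation=True, max_length=512)
    with torch.no_grad():
        logits = model(**enc).logits[0]  # shape: (sequence_length, 3)
    labels = [model.config.id2label[i] for i in logits.argmax(dim=-1).tolist()]
    tokens = tokenizer.convert_ids_to_tokens(enc["input_ids"][0].tolist())

    spans, current = [], []
    for token, label in zip(tokens, labels):
        if token in tokenizer.all_special_tokens:
            continue
        if label == "B":                     # a new span starts here
            if current:
                spans.append(tokenizer.convert_tokens_to_string(current).strip())
            current = [token]
        elif label == "I" and current:       # the current span continues
            current.append(token)
        else:                                # "O" (or a dangling "I") closes any open span
            if current:
                spans.append(tokenizer.convert_tokens_to_string(current).strip())
            current = []
    if current:
        spans.append(tokenizer.convert_tokens_to_string(current).strip())
    return spans

# Hypothetical prompt; the real prompting scheme is described in the Cuckoo README.
print(extract_spans("The sea under the clear sky looks blue. Question: What color is the sea? Answer:"))
```

The decoding loop only depends on the three labels declared in `config.json`, so it works whether the input is raw text or a Cuckoo-style prompt; tagged spans such as `['blue']` correspond to the extraction outputs shown in the model card excerpt above.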