nielsr (HF staff) committed
Commit 564c4b6 (verified)
1 parent: d956247

Add pipeline tag, library name, link to paper

This PR improves the model card by adding `pipeline_tag: question-answering`, ensuring people can find your model at https://huggingface.co/models?pipeline_tag=question-answering. It also sets the appropriate library name (Transformers) and links to the paper at https://huggingface.co/papers/2502.11275.

Files changed (1)
  1. README.md +174 -1
README.md CHANGED
@@ -1,9 +1,13 @@
  ---
  license: mit
+ library_name: transformers
+ pipeline_tag: question-answering
  ---

  # Cuckoo 🐦 [[Github]](https://github.com/KomeijiForce/Cuckoo)

+ This repository contains the model of the paper [Cuckoo: An IE Free Rider Hatched by Massive Nutrition in LLM's Nest](https://huggingface.co/papers/2502.11275).
+
  Cuckoo is a small (300M) information extraction (IE) model that imitates the next token prediction paradigm of large language models. Instead of retrieving from the vocabulary, Cuckoo predicts the next tokens by tagging them in the given input context as shown below:

  ![cuckoo](https://github.com/user-attachments/assets/d000f275-82a7-4939-aca8-341c61a774dc)
@@ -155,4 +159,173 @@ sea ['blue']
  fire ['red']
  night []
  ```
- which shows Cuckoo is not extracting any plausible spans but has the knowledge to understand the context.
+ which shows Cuckoo is not extracting any plausible spans but has the knowledge to understand the context.
+
+ # File information
+
+ The repository contains the following file information:
+
+ Filename: special_tokens_map.json
+ Content: {
+   "bos_token": {
+     "content": "<s>",
+     "lstrip": false,
+     "normalized": true,
+     "rstrip": false,
+     "single_word": false
+   },
+   "cls_token": {
+     "content": "<s>",
+     "lstrip": false,
+     "normalized": true,
+     "rstrip": false,
+     "single_word": false
+   },
+   "eos_token": {
+     "content": "</s>",
+     "lstrip": false,
+     "normalized": true,
+     "rstrip": false,
+     "single_word": false
+   },
+   "mask_token": {
+     "content": "<mask>",
+     "lstrip": true,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "pad_token": {
+     "content": "<pad>",
+     "lstrip": false,
+     "normalized": true,
+     "rstrip": false,
+     "single_word": false
+   },
+   "sep_token": {
+     "content": "</s>",
+     "lstrip": false,
+     "normalized": true,
+     "rstrip": false,
+     "single_word": false
+   },
+   "unk_token": {
+     "content": "<unk>",
+     "lstrip": false,
+     "normalized": true,
+     "rstrip": false,
+     "single_word": false
+   }
+ }
+
+ Filename: tokenizer_config.json
+ Content: {
+   "add_prefix_space": true,
+   "added_tokens_decoder": {
+     "0": {
+       "content": "<s>",
+       "lstrip": false,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "1": {
+       "content": "<pad>",
+       "lstrip": false,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "2": {
+       "content": "</s>",
+       "lstrip": false,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "3": {
+       "content": "<unk>",
+       "lstrip": false,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "50264": {
+       "content": "<mask>",
+       "lstrip": true,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     }
+   },
+   "bos_token": "<s>",
+   "clean_up_tokenization_spaces": false,
+   "cls_token": "<s>",
+   "eos_token": "</s>",
+   "errors": "replace",
+   "mask_token": "<mask>",
+   "max_length": 512,
+   "model_max_length": 512,
+   "pad_token": "<pad>",
+   "sep_token": "</s>",
+   "stride": 0,
+   "tokenizer_class": "RobertaTokenizer",
+   "trim_offsets": true,
+   "truncation_side": "right",
+   "truncation_strategy": "longest_first",
+   "unk_token": "<unk>"
+ }
+
+ Filename: merges.txt
+ Content: "Content of the file is larger than 50 KB, too long to display."
+
+ Filename: vocab.json
+ Content: "Content of the file is larger than 50 KB, too long to display."
+
+ Filename: config.json
+ Content: {
+   "_name_or_path": "models/ptr-large-c4-stage9",
+   "architectures": [
+     "RobertaForTokenClassification"
+   ],
+   "attention_probs_dropout_prob": 0.1,
+   "bos_token_id": 0,
+   "classifier_dropout": null,
+   "eos_token_id": 2,
+   "finetuning_task": "ner",
+   "hidden_act": "gelu",
+   "hidden_dropout_prob": 0.1,
+   "hidden_size": 1024,
+   "id2label": {
+     "0": "B",
+     "1": "I",
+     "2": "O"
+   },
+   "initializer_range": 0.02,
+   "intermediate_size": 4096,
+   "label2id": {
+     "B": 0,
+     "I": 1,
+     "O": 2
+   },
+   "layer_norm_eps": 1e-05,
+   "max_position_embeddings": 514,
+   "model_type": "roberta",
+   "num_attention_heads": 16,
+   "num_hidden_layers": 24,
+   "pad_token_id": 1,
+   "position_embedding_type": "absolute",
+   "torch_dtype": "float32",
+   "transformers_version": "4.45.2",
+   "type_vocab_size": 1,
+   "use_cache": true,
+   "vocab_size": 50265
+ }
+
+ Filename: tokenizer.json
+ Content: "Content of the file is larger than 50 KB, too long to display."
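
Below is a minimal, hypothetical sketch of how this checkpoint could be exercised through the Transformers token-classification API, in line with the model card's description of Cuckoo as predicting the next tokens by tagging them in the input context and with the B/I/O labels declared in `config.json` above. The model id and the example prompt are placeholders (assumptions, not taken from this repository); the exact prompting scheme is documented in the project README at https://github.com/KomeijiForce/Cuckoo.

```python
# Minimal sketch (not the official usage): load the checkpoint as a RoBERTa token
# classifier and merge its B/I/O tags into extracted spans.
import torch
from transformers import AutoModelForTokenClassification, AutoTokenizer

model_id = "KomeijiForce/Cuckoo-C4-Rainbow"  # placeholder; replace with this repo's id
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForTokenClassification.from_pretrained(model_id)
model.eval()

def extract_spans(text: str) -> list[str]:
    """Tag every token as B/I/O (the labels in config.json) and merge B/I runs into spans."""
    enc = tokenizer(text, return_tensors="pt", truncation=True, max_length=512)
    with torch.no_grad():
        logits = model(**enc).logits[0]  # shape: (sequence_length, 3)
    labels = [model.config.id2label[i] for i in logits.argmax(dim=-1).tolist()]
    tokens = tokenizer.convert_ids_to_tokens(enc["input_ids"][0].tolist())

    spans, current = [], []
    for token, label in zip(tokens, labels):
        if token in tokenizer.all_special_tokens:
            continue
        if label == "B":                     # a new span starts here
            if current:
                spans.append(tokenizer.convert_tokens_to_string(current).strip())
            current = [token]
        elif label == "I" and current:       # the current span continues
            current.append(token)
        else:                                # "O" (or a dangling "I") closes any open span
            if current:
                spans.append(tokenizer.convert_tokens_to_string(current).strip())
            current = []
    if current:
        spans.append(tokenizer.convert_tokens_to_string(current).strip())
    return spans

# Hypothetical prompt; the real prompting scheme is described in the Cuckoo README.
print(extract_spans("The sea under the clear sky looks blue. Question: What color is the sea? Answer:"))
```

The decoding loop only depends on the three labels declared in `config.json`, so it works whether the input is raw text or a Cuckoo-style prompt; tagged spans such as `['blue']` correspond to the extraction outputs shown in the model card excerpt above.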