Upload 5 files

Browse files

Files changed (6) hide show

.gitattributes +1 -0
19/classifier.py +29 -0
19/en.json +3 -0
19/en_shrunk.json +0 -0
19/en_truncated.json +0 -0
19/json_structure.json +308 -0

.gitattributes CHANGED Viewed

@@ -71,3 +71,4 @@ OptGuideOnDeviceModel/V1/optimization_guide_internal.dll filter=lfs diff=lfs mer
 OptGuideOnDeviceModel/V1/tokenizer.json filter=lfs diff=lfs merge=lfs -text
 SODA/2311.17901v1.pdf filter=lfs diff=lfs merge=lfs -text
 19/en.fb filter=lfs diff=lfs merge=lfs -text

 OptGuideOnDeviceModel/V1/tokenizer.json filter=lfs diff=lfs merge=lfs -text
 SODA/2311.17901v1.pdf filter=lfs diff=lfs merge=lfs -text
 19/en.fb filter=lfs diff=lfs merge=lfs -text
+19/en.json filter=lfs diff=lfs merge=lfs -text

19/classifier.py ADDED Viewed

	@@ -0,0 +1,29 @@

+import json
+import re
+# Load model JSON
+with open("en.json", "r") as f:
+    model_data = json.load(f)
+# Define regex patterns
+patterns = {
+    "phone": r"\b\d{3}[-.\s]?\d{3}[-.\s]?\d{4}\b",
+    "url": r"https?://\S+|www\.\S+",
+    "email": r"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b"
+}
+# Simulate entity classification
+def classify_text(text):
+    annotations = []
+    for entity, pattern in patterns.items():
+        matches = re.findall(pattern, text)
+        for match in matches:
+            annotations.append({"token": match, "type": entity, "confidence_score": 0.9})
+    return {"annotations": annotations}
+# Test classification
+test_text = "Hello world this is Call 123-456-7890 or visit www.example.com or email [email protected] soe other text."
+result = classify_text(test_text)
+print("Classification Result:", json.dumps(result, indent=2))

19/en.json ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:ca594d46c1235d48437e84401c76ec0d709b7376b30025e653a387d3f84761b3
+size 42288695

19/en_shrunk.json ADDED Viewed

The diff for this file is too large to render. See raw diff

19/en_truncated.json ADDED Viewed

The diff for this file is too large to render. See raw diff

19/json_structure.json ADDED Viewed

	@@ -0,0 +1,308 @@

+{
+  "locales": "str",
+  "version": "int",
+  "name": "str",
+  "selection_feature_options": {
+    "num_buckets": "int",
+    "embedding_size": "int",
+    "context_size": "int",
+    "max_selection_span": "int",
+    "chargram_orders": [
+      "int"
+    ],
+    "extract_case_feature": "bool",
+    "remap_digits": "bool",
+    "lowercase_tokens": "bool",
+    "selection_reduced_output_space": "bool",
+    "default_collection": "int",
+    "tokenization_codepoint_config": [
+      {
+        "start": "...",
+        "end": "...",
+        "role": "..."
+      }
+    ],
+    "center_token_selection_method": "str",
+    "supported_codepoint_ranges": [
+      {
+        "end": "..."
+      }
+    ],
+    "min_supported_codepoint_ratio": "float",
+    "feature_version": "int",
+    "ignored_span_boundary_codepoints": [
+      "int"
+    ],
+    "bounds_sensitive_features": {
+      "enabled": "bool",
+      "num_tokens_before": "int",
+      "num_tokens_inside_left": "int",
+      "num_tokens_inside_right": "int",
+      "num_tokens_after": "int",
+      "include_inside_bag": "bool",
+      "include_inside_length": "bool",
+      "score_single_token_spans_as_zero": "bool"
+    },
+    "tokenize_on_script_change": "bool",
+    "use_pipe_character_for_newline": "bool"
+  },
+  "classification_feature_options": {
+    "num_buckets": "int",
+    "embedding_size": "int",
+    "context_size": "int",
+    "max_selection_span": "int",
+    "chargram_orders": [
+      "int"
+    ],
+    "extract_case_feature": "bool",
+    "remap_digits": "bool",
+    "lowercase_tokens": "bool",
+    "selection_reduced_output_space": "bool",
+    "collections": [
+      "str"
+    ],
+    "default_collection": "int",
+    "split_tokens_on_selection_boundaries": "bool",
+    "tokenization_codepoint_config": [
+      {
+        "start": "...",
+        "end": "...",
+        "role": "..."
+      }
+    ],
+    "center_token_selection_method": "str",
+    "supported_codepoint_ranges": [
+      {
+        "end": "..."
+      }
+    ],
+    "min_supported_codepoint_ratio": "float",
+    "feature_version": "int",
+    "ignored_span_boundary_codepoints": [
+      "int"
+    ],
+    "bounds_sensitive_features": {
+      "enabled": "bool",
+      "num_tokens_before": "int",
+      "num_tokens_inside_left": "int",
+      "num_tokens_inside_right": "int",
+      "num_tokens_after": "int",
+      "include_inside_bag": "bool",
+      "include_inside_length": "bool",
+      "score_single_token_spans_as_zero": "bool"
+    },
+    "tokenize_on_script_change": "bool"
+  },
+  "selection_model": [
+    "int"
+  ],
+  "classification_model": [
+    "int"
+  ],
+  "embedding_model": [
+    "int"
+  ],
+  "selection_options": {},
+  "classification_options": {
+    "phone_min_num_digits": "int",
+    "address_min_num_tokens": "int"
+  },
+  "regex_model": {
+    "patterns": [
+      {
+        "collection_name": "...",
+        "priority_score": "...",
+        "compressed_pattern": "..."
+      }
+    ]
+  },
+  "datetime_model": {
+    "locales": [
+      "str"
+    ],
+    "patterns": [
+      {
+        "regexes": "...",
+        "locales": "...",
+        "priority_score": "..."
+      }
+    ],
+    "extractors": [
+      {
+        "extractor": "...",
+        "locales": "...",
+        "compressed_pattern": "..."
+      }
+    ],
+    "default_locales": [
+      "int"
+    ],
+    "generate_alternative_interpretations_when_ambiguous": "bool",
+    "prefer_future_for_unspecified_date": "bool"
+  },
+  "triggering_options": {
+    "dictionary_locales": "str",
+    "collection_to_priority": [
+      {
+        "key": "...",
+        "value": "..."
+      }
+    ]
+  },
+  "output_options": {
+    "filtered_collections_annotation": [
+      "str"
+    ],
+    "filtered_collections_classification": [
+      "str"
+    ],
+    "filtered_collections_selection": [
+      "str"
+    ]
+  },
+  "intent_options": {
+    "generator": [
+      {
+        "type": "...",
+        "compressed_lua_template_generator": "..."
+      }
+    ]
+  },
+  "resources": {
+    "locale": [
+      {}
+    ],
+    "resource_entry": [
+      {
+        "name": "...",
+        "resource": "..."
+      }
+    ]
+  },
+  "entity_data_schema": [
+    "int"
+  ],
+  "number_annotator_options": {
+    "enabled": "bool",
+    "priority_score": "float",
+    "enabled_annotation_usecases": "int",
+    "allowed_prefix_codepoints": [
+      "int"
+    ],
+    "allowed_suffix_codepoints": [
+      "int"
+    ],
+    "ignored_prefix_span_boundary_codepoints": [
+      "int"
+    ],
+    "ignored_suffix_span_boundary_codepoints": [
+      "int"
+    ],
+    "enable_percentage": "bool",
+    "percentage_pieces_string": "str",
+    "percentage_pieces_offsets": [
+      "int"
+    ],
+    "float_number_priority_score": "float",
+    "percentage_annotation_usecases": "int"
+  },
+  "duration_annotator_options": {
+    "enabled": "bool",
+    "priority_score": "float",
+    "enabled_annotation_usecases": "int",
+    "week_expressions": [
+      "str"
+    ],
+    "day_expressions": [
+      "str"
+    ],
+    "hour_expressions": [
+      "str"
+    ],
+    "minute_expressions": [
+      "str"
+    ],
+    "second_expressions": [
+      "str"
+    ],
+    "filler_expressions": [
+      "str"
+    ],
+    "half_expressions": [
+      "str"
+    ],
+    "sub_token_separator_codepoints": [
+      "int"
+    ]
+  },
+  "embedding_pruning_mask": {},
+  "contact_annotator_options": {
+    "enable_declension": "bool",
+    "language": "str"
+  },
+  "money_parsing_options": {
+    "separators": [
+      "int"
+    ],
+    "quantities_name_to_exponent": [
+      {
+        "key": "...",
+        "value": "..."
+      }
+    ]
+  },
+  "translate_annotator_options": {
+    "enabled": "bool",
+    "priority_score": "float",
+    "algorithm": "str",
+    "backoff_options": {}
+  },
+  "conflict_resolution_options": {
+    "prioritize_longest_annotation": "bool",
+    "do_conflict_resolution_in_raw_mode": "bool"
+  },
+  "pod_ner_model": {
+    "tflite_model": [
+      "int"
+    ],
+    "word_piece_vocab": [
+      "int"
+    ],
+    "logits_index_in_output_tensor": "int",
+    "priority_score": "float",
+    "labels": [
+      {
+        "boise_type": "...",
+        "mention_type": "...",
+        "collection_id": "..."
+      }
+    ],
+    "collections": [
+      {
+        "name": "...",
+        "single_token_priority_score": "...",
+        "multi_token_priority_score": "..."
+      }
+    ],
+    "min_number_of_tokens": "int",
+    "min_number_of_wordpieces": "int"
+  },
+  "vocab_model": {
+    "vocab_trie": [
+      "int"
+    ],
+    "beginner_level": {
+      "dense_data": {
+        "data": "...",
+        "size": "..."
+      }
+    },
+    "do_not_trigger_in_upper_case": {
+      "sparse_data": {
+        "sorted_indices_32": "..."
+      }
+    },
+    "triggering_locales": "str",
+    "priority_score": "float"
+  }
+}