dejanseo committed on
Commit
6f18aef
·
verified ·
1 Parent(s): d552aac

Upload 5 files

Browse files
.gitattributes CHANGED
@@ -71,3 +71,4 @@ OptGuideOnDeviceModel/V1/optimization_guide_internal.dll filter=lfs diff=lfs mer
71
  OptGuideOnDeviceModel/V1/tokenizer.json filter=lfs diff=lfs merge=lfs -text
72
  SODA/2311.17901v1.pdf filter=lfs diff=lfs merge=lfs -text
73
  19/en.fb filter=lfs diff=lfs merge=lfs -text
 
 
71
  OptGuideOnDeviceModel/V1/tokenizer.json filter=lfs diff=lfs merge=lfs -text
72
  SODA/2311.17901v1.pdf filter=lfs diff=lfs merge=lfs -text
73
  19/en.fb filter=lfs diff=lfs merge=lfs -text
74
+ 19/en.json filter=lfs diff=lfs merge=lfs -text
19/classifier.py ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import re
3
+
4
+ # Load model JSON
5
+ with open("en.json", "r") as f:
6
+ model_data = json.load(f)
7
+
8
+ # Define regex patterns
9
+ patterns = {
10
+ "phone": r"\b\d{3}[-.\s]?\d{3}[-.\s]?\d{4}\b",
11
+ "url": r"https?://\S+|www\.\S+",
12
+ "email": r"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b"
13
+ }
14
+
15
+ # Simulate entity classification
16
+ def classify_text(text):
17
+ annotations = []
18
+
19
+ for entity, pattern in patterns.items():
20
+ matches = re.findall(pattern, text)
21
+ for match in matches:
22
+ annotations.append({"token": match, "type": entity, "confidence_score": 0.9})
23
+
24
+ return {"annotations": annotations}
25
+
26
+ # Test classification
27
+ test_text = "Hello world this is Call 123-456-7890 or visit www.example.com or email [email protected] soe other text."
28
+ result = classify_text(test_text)
29
+ print("Classification Result:", json.dumps(result, indent=2))
19/en.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ca594d46c1235d48437e84401c76ec0d709b7376b30025e653a387d3f84761b3
3
+ size 42288695
19/en_shrunk.json ADDED
The diff for this file is too large to render. See raw diff
 
19/en_truncated.json ADDED
The diff for this file is too large to render. See raw diff
 
19/json_structure.json ADDED
@@ -0,0 +1,308 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "locales": "str",
3
+ "version": "int",
4
+ "name": "str",
5
+ "selection_feature_options": {
6
+ "num_buckets": "int",
7
+ "embedding_size": "int",
8
+ "context_size": "int",
9
+ "max_selection_span": "int",
10
+ "chargram_orders": [
11
+ "int"
12
+ ],
13
+ "extract_case_feature": "bool",
14
+ "remap_digits": "bool",
15
+ "lowercase_tokens": "bool",
16
+ "selection_reduced_output_space": "bool",
17
+ "default_collection": "int",
18
+ "tokenization_codepoint_config": [
19
+ {
20
+ "start": "...",
21
+ "end": "...",
22
+ "role": "..."
23
+ }
24
+ ],
25
+ "center_token_selection_method": "str",
26
+ "supported_codepoint_ranges": [
27
+ {
28
+ "end": "..."
29
+ }
30
+ ],
31
+ "min_supported_codepoint_ratio": "float",
32
+ "feature_version": "int",
33
+ "ignored_span_boundary_codepoints": [
34
+ "int"
35
+ ],
36
+ "bounds_sensitive_features": {
37
+ "enabled": "bool",
38
+ "num_tokens_before": "int",
39
+ "num_tokens_inside_left": "int",
40
+ "num_tokens_inside_right": "int",
41
+ "num_tokens_after": "int",
42
+ "include_inside_bag": "bool",
43
+ "include_inside_length": "bool",
44
+ "score_single_token_spans_as_zero": "bool"
45
+ },
46
+ "tokenize_on_script_change": "bool",
47
+ "use_pipe_character_for_newline": "bool"
48
+ },
49
+ "classification_feature_options": {
50
+ "num_buckets": "int",
51
+ "embedding_size": "int",
52
+ "context_size": "int",
53
+ "max_selection_span": "int",
54
+ "chargram_orders": [
55
+ "int"
56
+ ],
57
+ "extract_case_feature": "bool",
58
+ "remap_digits": "bool",
59
+ "lowercase_tokens": "bool",
60
+ "selection_reduced_output_space": "bool",
61
+ "collections": [
62
+ "str"
63
+ ],
64
+ "default_collection": "int",
65
+ "split_tokens_on_selection_boundaries": "bool",
66
+ "tokenization_codepoint_config": [
67
+ {
68
+ "start": "...",
69
+ "end": "...",
70
+ "role": "..."
71
+ }
72
+ ],
73
+ "center_token_selection_method": "str",
74
+ "supported_codepoint_ranges": [
75
+ {
76
+ "end": "..."
77
+ }
78
+ ],
79
+ "min_supported_codepoint_ratio": "float",
80
+ "feature_version": "int",
81
+ "ignored_span_boundary_codepoints": [
82
+ "int"
83
+ ],
84
+ "bounds_sensitive_features": {
85
+ "enabled": "bool",
86
+ "num_tokens_before": "int",
87
+ "num_tokens_inside_left": "int",
88
+ "num_tokens_inside_right": "int",
89
+ "num_tokens_after": "int",
90
+ "include_inside_bag": "bool",
91
+ "include_inside_length": "bool",
92
+ "score_single_token_spans_as_zero": "bool"
93
+ },
94
+ "tokenize_on_script_change": "bool"
95
+ },
96
+ "selection_model": [
97
+ "int"
98
+ ],
99
+ "classification_model": [
100
+ "int"
101
+ ],
102
+ "embedding_model": [
103
+ "int"
104
+ ],
105
+ "selection_options": {},
106
+ "classification_options": {
107
+ "phone_min_num_digits": "int",
108
+ "address_min_num_tokens": "int"
109
+ },
110
+ "regex_model": {
111
+ "patterns": [
112
+ {
113
+ "collection_name": "...",
114
+ "priority_score": "...",
115
+ "compressed_pattern": "..."
116
+ }
117
+ ]
118
+ },
119
+ "datetime_model": {
120
+ "locales": [
121
+ "str"
122
+ ],
123
+ "patterns": [
124
+ {
125
+ "regexes": "...",
126
+ "locales": "...",
127
+ "priority_score": "..."
128
+ }
129
+ ],
130
+ "extractors": [
131
+ {
132
+ "extractor": "...",
133
+ "locales": "...",
134
+ "compressed_pattern": "..."
135
+ }
136
+ ],
137
+ "default_locales": [
138
+ "int"
139
+ ],
140
+ "generate_alternative_interpretations_when_ambiguous": "bool",
141
+ "prefer_future_for_unspecified_date": "bool"
142
+ },
143
+ "triggering_options": {
144
+ "dictionary_locales": "str",
145
+ "collection_to_priority": [
146
+ {
147
+ "key": "...",
148
+ "value": "..."
149
+ }
150
+ ]
151
+ },
152
+ "output_options": {
153
+ "filtered_collections_annotation": [
154
+ "str"
155
+ ],
156
+ "filtered_collections_classification": [
157
+ "str"
158
+ ],
159
+ "filtered_collections_selection": [
160
+ "str"
161
+ ]
162
+ },
163
+ "intent_options": {
164
+ "generator": [
165
+ {
166
+ "type": "...",
167
+ "compressed_lua_template_generator": "..."
168
+ }
169
+ ]
170
+ },
171
+ "resources": {
172
+ "locale": [
173
+ {}
174
+ ],
175
+ "resource_entry": [
176
+ {
177
+ "name": "...",
178
+ "resource": "..."
179
+ }
180
+ ]
181
+ },
182
+ "entity_data_schema": [
183
+ "int"
184
+ ],
185
+ "number_annotator_options": {
186
+ "enabled": "bool",
187
+ "priority_score": "float",
188
+ "enabled_annotation_usecases": "int",
189
+ "allowed_prefix_codepoints": [
190
+ "int"
191
+ ],
192
+ "allowed_suffix_codepoints": [
193
+ "int"
194
+ ],
195
+ "ignored_prefix_span_boundary_codepoints": [
196
+ "int"
197
+ ],
198
+ "ignored_suffix_span_boundary_codepoints": [
199
+ "int"
200
+ ],
201
+ "enable_percentage": "bool",
202
+ "percentage_pieces_string": "str",
203
+ "percentage_pieces_offsets": [
204
+ "int"
205
+ ],
206
+ "float_number_priority_score": "float",
207
+ "percentage_annotation_usecases": "int"
208
+ },
209
+ "duration_annotator_options": {
210
+ "enabled": "bool",
211
+ "priority_score": "float",
212
+ "enabled_annotation_usecases": "int",
213
+ "week_expressions": [
214
+ "str"
215
+ ],
216
+ "day_expressions": [
217
+ "str"
218
+ ],
219
+ "hour_expressions": [
220
+ "str"
221
+ ],
222
+ "minute_expressions": [
223
+ "str"
224
+ ],
225
+ "second_expressions": [
226
+ "str"
227
+ ],
228
+ "filler_expressions": [
229
+ "str"
230
+ ],
231
+ "half_expressions": [
232
+ "str"
233
+ ],
234
+ "sub_token_separator_codepoints": [
235
+ "int"
236
+ ]
237
+ },
238
+ "embedding_pruning_mask": {},
239
+ "contact_annotator_options": {
240
+ "enable_declension": "bool",
241
+ "language": "str"
242
+ },
243
+ "money_parsing_options": {
244
+ "separators": [
245
+ "int"
246
+ ],
247
+ "quantities_name_to_exponent": [
248
+ {
249
+ "key": "...",
250
+ "value": "..."
251
+ }
252
+ ]
253
+ },
254
+ "translate_annotator_options": {
255
+ "enabled": "bool",
256
+ "priority_score": "float",
257
+ "algorithm": "str",
258
+ "backoff_options": {}
259
+ },
260
+ "conflict_resolution_options": {
261
+ "prioritize_longest_annotation": "bool",
262
+ "do_conflict_resolution_in_raw_mode": "bool"
263
+ },
264
+ "pod_ner_model": {
265
+ "tflite_model": [
266
+ "int"
267
+ ],
268
+ "word_piece_vocab": [
269
+ "int"
270
+ ],
271
+ "logits_index_in_output_tensor": "int",
272
+ "priority_score": "float",
273
+ "labels": [
274
+ {
275
+ "boise_type": "...",
276
+ "mention_type": "...",
277
+ "collection_id": "..."
278
+ }
279
+ ],
280
+ "collections": [
281
+ {
282
+ "name": "...",
283
+ "single_token_priority_score": "...",
284
+ "multi_token_priority_score": "..."
285
+ }
286
+ ],
287
+ "min_number_of_tokens": "int",
288
+ "min_number_of_wordpieces": "int"
289
+ },
290
+ "vocab_model": {
291
+ "vocab_trie": [
292
+ "int"
293
+ ],
294
+ "beginner_level": {
295
+ "dense_data": {
296
+ "data": "...",
297
+ "size": "..."
298
+ }
299
+ },
300
+ "do_not_trigger_in_upper_case": {
301
+ "sparse_data": {
302
+ "sorted_indices_32": "..."
303
+ }
304
+ },
305
+ "triggering_locales": "str",
306
+ "priority_score": "float"
307
+ }
308
+ }