Update clean/clean.py
Browse files- clean/clean.py +19 -1
clean/clean.py
CHANGED
@@ -1,3 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
import functools
|
2 |
import gzip
|
3 |
import hashlib
|
@@ -127,7 +144,8 @@ def clean_text(text,
|
|
127 |
counter_inc_fn("filtered:too_few_sentences")
|
128 |
return
|
129 |
counter_inc_fn("passed")
|
130 |
-
result = "\
|
|
|
131 |
return result
|
132 |
|
133 |
|
|
|
1 |
+
# Code adapted from https://github.com/tensorflow/datasets/blob/master/tensorflow_datasets/text/c4_utils.py
|
2 |
+
# that has the following license
|
3 |
+
# coding=utf-8
|
4 |
+
# Copyright 2021 The TensorFlow Datasets Authors.
|
5 |
+
#
|
6 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
7 |
+
# you may not use this file except in compliance with the License.
|
8 |
+
# You may obtain a copy of the License at
|
9 |
+
#
|
10 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
11 |
+
#
|
12 |
+
# Unless required by applicable law or agreed to in writing, software
|
13 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
14 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
15 |
+
# See the License for the specific language governing permissions and
|
16 |
+
# limitations under the License.
|
17 |
+
|
18 |
import functools
|
19 |
import gzip
|
20 |
import hashlib
|
|
|
144 |
counter_inc_fn("filtered:too_few_sentences")
|
145 |
return
|
146 |
counter_inc_fn("passed")
|
147 |
+
result = "\
|
148 |
+
".join(valid_lines).strip()
|
149 |
return result
|
150 |
|
151 |
|