Update RepoPipeline.py
Browse files- RepoPipeline.py +22 -7
RepoPipeline.py
CHANGED
@@ -34,19 +34,35 @@ def extract_code_and_docs(text: str):
|
|
34 |
return code_set, docs_set
|
35 |
|
36 |
|
37 |
-
def
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
38 |
"""
|
39 |
The method for extracting requirements.
|
40 |
:param lines: requirements.
|
41 |
:return: requirement libraries.
|
42 |
"""
|
43 |
requirements_set = set()
|
|
|
44 |
for line in lines:
|
45 |
-
line = line.replace(
|
46 |
try:
|
47 |
if " == " in line:
|
48 |
splitLine = line.split(" == ")
|
49 |
-
|
50 |
splitLine = line.split("==")
|
51 |
requirements_set.add(splitLine[0])
|
52 |
except:
|
@@ -132,8 +148,7 @@ def extract_information(repos, headers=None):
|
|
132 |
try:
|
133 |
file_content = tar.extractfile(member).read().decode("utf-8")
|
134 |
# extract readme
|
135 |
-
readmes_set =
|
136 |
-
readmes_set.add(file_content)
|
137 |
repo_info["readmes"].update(readmes_set)
|
138 |
except UnicodeDecodeError as e:
|
139 |
tqdm.write(
|
@@ -144,9 +159,9 @@ def extract_information(repos, headers=None):
|
|
144 |
# 4. Extracting requirements.
|
145 |
elif member.name.endswith("requirements.txt") and member.isfile():
|
146 |
try:
|
147 |
-
|
148 |
# extract requirements
|
149 |
-
requirements_set = extract_requirements(
|
150 |
repo_info["requirements"].update(requirements_set)
|
151 |
except UnicodeDecodeError as e:
|
152 |
tqdm.write(
|
|
|
34 |
return code_set, docs_set
|
35 |
|
36 |
|
37 |
+
def extract_readmes(file_content):
    """
    The method for extracting readmes.
    :param file_content: full text of a README file as a single string.
    :return: set of the README's lines with surrounding whitespace stripped;
        duplicate lines collapse and blank lines contribute one empty string.
    """
    # Splitting on "\n" already drops the newline characters; strip() then
    # removes leftover whitespace such as a trailing "\r" or indentation.
    return {raw_line.strip() for raw_line in file_content.split("\n")}
|
50 |
+
|
51 |
+
|
52 |
+
def extract_requirements(file_content):
|
53 |
"""
|
54 |
The method for extracting requirements.
|
55 |
:param lines: requirements.
|
56 |
:return: requirement libraries.
|
57 |
"""
|
58 |
requirements_set = set()
|
59 |
+
lines = file_content.split('\n')
|
60 |
for line in lines:
|
61 |
+
line = line.replace("\n", "").strip()
|
62 |
try:
|
63 |
if " == " in line:
|
64 |
splitLine = line.split(" == ")
|
65 |
+
else:
|
66 |
splitLine = line.split("==")
|
67 |
requirements_set.add(splitLine[0])
|
68 |
except:
|
|
|
148 |
try:
|
149 |
file_content = tar.extractfile(member).read().decode("utf-8")
|
150 |
# extract readme
|
151 |
+
readmes_set = extract_readmes(file_content)
|
|
|
152 |
repo_info["readmes"].update(readmes_set)
|
153 |
except UnicodeDecodeError as e:
|
154 |
tqdm.write(
|
|
|
159 |
# 4. Extracting requirements.
|
160 |
elif member.name.endswith("requirements.txt") and member.isfile():
|
161 |
try:
|
162 |
+
file_content = tar.extractfile(member).read().decode("utf-8")
|
163 |
# extract requirements
|
164 |
+
requirements_set = extract_requirements(file_content)
|
165 |
repo_info["requirements"].update(requirements_set)
|
166 |
except UnicodeDecodeError as e:
|
167 |
tqdm.write(
|