Upload folder using huggingface_hub
Browse files
- .gitattributes +4 -0
- gen_ans.py +61 -0
- gen_ans2.py +60 -0
- load.py +13 -0
- qa_pairs_all-alpha1b-final.json +3 -0
- qa_pairs_all-alpha1b.json +3 -0
- qa_pairs_all-alpha1b_2.json +3 -0
- qa_pairs_all-qwen7b.json +3 -0
.gitattributes
CHANGED
@@ -33,3 +33,7 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
|
|
33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
36 |
+
qa_pairs_all-alpha1b-final.json filter=lfs diff=lfs merge=lfs -text
|
37 |
+
qa_pairs_all-alpha1b.json filter=lfs diff=lfs merge=lfs -text
|
38 |
+
qa_pairs_all-alpha1b_2.json filter=lfs diff=lfs merge=lfs -text
|
39 |
+
qa_pairs_all-qwen7b.json filter=lfs diff=lfs merge=lfs -text
|
gen_ans.py
ADDED
@@ -0,0 +1,61 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from datasets import load_dataset
|
2 |
+
from openai import AsyncOpenAI
|
3 |
+
from tqdm import tqdm
|
4 |
+
import asyncio
|
5 |
+
import json
|
6 |
+
import os
|
7 |
+
|
# Async OpenAI-compatible client pointed at a local inference server
# (e.g. vLLM serving on port 8000); the api_key is a placeholder because
# the local server presumably does not validate it — confirm.
client = AsyncOpenAI(api_key="no-need", base_url="http://localhost:8000/v1")
9 |
+
|
async def generate_answer(messages):
    """Request one chat completion from the local model server.

    Args:
        messages: a conversation — list of {'role', 'content'} dicts —
            used as the prompt.

    Returns:
        The assistant's reply text, or the literal string 'error' when the
        request fails, so batch processing keeps going (best-effort).
    """
    try:
        response = await client.chat.completions.create(
            model="outputs/out-alpha",
            messages=messages,
            max_tokens=2048,
        )
        return response.choices[0].message.content
    except Exception as e:
        # Report the failure instead of silently discarding it (the original
        # bound `e` but never used it), then return the sentinel unchanged.
        print(f"generate_answer failed: {e!r}")
        return 'error'
20 |
+
|
async def process_batch(questions, batch_num, all_qa_pairs):
    """Answer one batch of conversations concurrently and checkpoint results.

    Args:
        questions: list of conversations (each a list of message dicts).
            Each conversation is MUTATED in place: the generated assistant
            turn is appended to it.
        batch_num: zero-based batch index (informational).
        all_qa_pairs: running list of results; extended in place and
            re-serialized to disk after the batch completes.

    Returns:
        The list of generated answer strings for this batch (in input order).
    """
    # Fan out all requests concurrently; gather preserves input order.
    # The original wrapped task *creation* in tqdm, but building coroutines
    # is instantaneous, so that progress bar measured nothing — removed.
    tasks = [generate_answer(q) for q in questions]
    answers = await asyncio.gather(*tasks)

    # Pair each question with its answer and append to the shared results.
    for q, a in zip(questions, answers):
        q.append({'role': 'assistant', 'content': a})
        all_qa_pairs.append({"conversations": q})

    # Checkpoint: rewrite the whole result file so progress survives a crash.
    with open('qa_pairs_all-alpha1b_2.json', 'w') as f:
        json.dump(all_qa_pairs, f, indent=2)

    return answers
37 |
+
|
async def main(start_index=21600, batch_size=200):
    """Generate model answers for the tail of the sft-r1 train split.

    Args:
        start_index: skip dataset rows before this index. Defaults to the
            hard-coded resume point of the original run (21600).
        batch_size: number of concurrent requests per batch (default 200,
            as in the original).
    """
    dataset = load_dataset('qnguyen3/sft-r1')

    # Resume support: reload any previously checkpointed results so the
    # output file keeps earlier pairs.
    all_qa_pairs = []
    if os.path.exists('qa_pairs_all-alpha1b_2.json'):
        with open('qa_pairs_all-alpha1b_2.json', 'r') as f:
            all_qa_pairs = json.load(f)

    # Strip the final (assistant) turn from each remaining conversation so
    # the model regenerates it.
    question_list = []
    print("Preparing questions...")
    for i, item in tqdm(enumerate(dataset['train']), desc="Loading dataset"):
        if i >= start_index:
            question_list.append(item['messages'][:-1])

    # Process sequentially in fixed-size batches; each batch fans out its
    # requests concurrently inside process_batch.
    for i in tqdm(range(0, len(question_list), batch_size), desc="Processing batches"):
        batch_questions = question_list[i:i+batch_size]
        batch_num = i // batch_size
        await process_batch(batch_questions, batch_num, all_qa_pairs)
59 |
+
|
# Script entry point: run the async pipeline on a fresh event loop.
if __name__ == "__main__":
    asyncio.run(main())
|
gen_ans2.py
ADDED
@@ -0,0 +1,60 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from datasets import load_dataset
|
2 |
+
from openai import AsyncOpenAI
|
3 |
+
from tqdm import tqdm
|
4 |
+
import asyncio
|
5 |
+
import json
|
6 |
+
import os
|
7 |
+
|
# Async OpenAI-compatible client pointed at a remote inference server via a
# hard-coded IP:port; the api_key is a placeholder — presumably the server
# does not validate it (confirm). NOTE(review): plain HTTP to a raw IP.
client = AsyncOpenAI(api_key="no-need", base_url="http://147.185.42.13:24585/v1")
9 |
+
|
async def generate_answer(messages):
    """Request one chat completion from the Qwen2.5-7B-Instruct server.

    Args:
        messages: a conversation — list of {'role', 'content'} dicts —
            used as the prompt.

    Returns:
        The assistant's reply text, or the literal string 'error' when the
        request fails, so batch processing keeps going (best-effort).
    """
    try:
        response = await client.chat.completions.create(
            model="Qwen/Qwen2.5-7B-Instruct",
            messages=messages,
            max_tokens=2048,
        )
        return response.choices[0].message.content
    except Exception as e:
        # Report the failure instead of silently discarding it (the original
        # bound `e` but never used it), then return the sentinel unchanged.
        print(f"generate_answer failed: {e!r}")
        return 'error'
20 |
+
|
async def process_batch(questions, batch_num, all_qa_pairs):
    """Answer one batch of conversations concurrently and checkpoint results.

    Args:
        questions: list of conversations (each a list of message dicts).
            Each conversation is MUTATED in place: the generated assistant
            turn is appended to it.
        batch_num: zero-based batch index (informational).
        all_qa_pairs: running list of results; extended in place and
            re-serialized to disk after the batch completes.

    Returns:
        The list of generated answer strings for this batch (in input order).
    """
    # Fan out all requests concurrently; gather preserves input order.
    # The original wrapped task *creation* in tqdm, but building coroutines
    # is instantaneous, so that progress bar measured nothing — removed.
    tasks = [generate_answer(q) for q in questions]
    answers = await asyncio.gather(*tasks)

    # Pair each question with its answer and append to the shared results.
    for q, a in zip(questions, answers):
        q.append({'role': 'assistant', 'content': a})
        all_qa_pairs.append({"conversations": q})

    # Checkpoint: rewrite the whole result file so progress survives a crash.
    with open('qa_pairs_all-qwen7b.json', 'w') as f:
        json.dump(all_qa_pairs, f, indent=2)

    return answers
37 |
+
|
async def main(batch_size=200):
    """Generate model answers for every row of the sft-r1 train split.

    Args:
        batch_size: number of concurrent requests per batch (default 200,
            as in the original).
    """
    dataset = load_dataset('qnguyen3/sft-r1')

    # Reload any previously checkpointed results so the output file keeps
    # earlier pairs.
    # NOTE(review): results are reloaded but the question list below is NOT
    # trimmed, so re-running this script appends duplicates — confirm intended.
    all_qa_pairs = []
    if os.path.exists('qa_pairs_all-qwen7b.json'):
        with open('qa_pairs_all-qwen7b.json', 'r') as f:
            all_qa_pairs = json.load(f)

    # Strip the final (assistant) turn from each conversation so the model
    # regenerates it. The original's enumerate index was unused — iterate
    # the split directly.
    question_list = []
    print("Preparing questions...")
    for item in tqdm(dataset['train'], desc="Loading dataset"):
        question_list.append(item['messages'][:-1])

    # Process sequentially in fixed-size batches; each batch fans out its
    # requests concurrently inside process_batch.
    for i in tqdm(range(0, len(question_list), batch_size), desc="Processing batches"):
        batch_questions = question_list[i:i+batch_size]
        batch_num = i // batch_size
        await process_batch(batch_questions, batch_num, all_qa_pairs)
|
58 |
+
|
# Script entry point: run the async pipeline on a fresh event loop.
if __name__ == "__main__":
    asyncio.run(main())
|
load.py
ADDED
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
"""Merge the two partial alpha-1b QA dumps into one final JSON file."""
import json

# Read the first generation run's results.
with open('qa_pairs_all-alpha1b.json', 'r') as f:
    data = json.load(f)

# Read the resumed second run's results.
with open('qa_pairs_all-alpha1b_2.json', 'r') as f:
    data2 = json.load(f)

# Concatenate and save as qa_pairs_all-alpha1b-final.json.
# (The original comment said "_3.json", which did not match the code.)
data.extend(data2)
with open('qa_pairs_all-alpha1b-final.json', 'w') as f:
    json.dump(data, f, indent=2, ensure_ascii=False)
qa_pairs_all-alpha1b-final.json
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:b172c524b52bf65c665d3a80965138d96358478c954c3c7c9a5bd9c77ec92458
|
3 |
+
size 132102846
|
qa_pairs_all-alpha1b.json
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:9daa5f71e42042eb06d7caa3a2b6d5fa216458700ef56f9759b1bb88eb9f9e7d
|
3 |
+
size 52337224
|
qa_pairs_all-alpha1b_2.json
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:55782571d4178a1f2817166820f271f7afef5a02370370103e5e7602afc77411
|
3 |
+
size 80234968
|
qa_pairs_all-qwen7b.json
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:356685dd6959ef2bc769a93ead26ce5e3f521d731c7e2835351177f1009c25e5
|
3 |
+
size 166879355
|