qnguyen3 commited on
Commit
22f338c
·
verified ·
1 Parent(s): 4f55a61

Upload folder using huggingface_hub

Browse files
.gitattributes CHANGED
@@ -33,3 +33,7 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ qa_pairs_all-alpha1b-final.json filter=lfs diff=lfs merge=lfs -text
37
+ qa_pairs_all-alpha1b.json filter=lfs diff=lfs merge=lfs -text
38
+ qa_pairs_all-alpha1b_2.json filter=lfs diff=lfs merge=lfs -text
39
+ qa_pairs_all-qwen7b.json filter=lfs diff=lfs merge=lfs -text
gen_ans.py ADDED
@@ -0,0 +1,61 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from datasets import load_dataset
2
+ from openai import AsyncOpenAI
3
+ from tqdm import tqdm
4
+ import asyncio
5
+ import json
6
+ import os
7
+
8
+ client = AsyncOpenAI(api_key="no-need", base_url="http://localhost:8000/v1")
9
+
10
+ async def generate_answer(messages):
11
+ try:
12
+ response = await client.chat.completions.create(
13
+ model="outputs/out-alpha",
14
+ messages=messages,
15
+ max_tokens=2048,
16
+ )
17
+ return response.choices[0].message.content
18
+ except Exception as e:
19
+ return 'error'
20
+
21
+ async def process_batch(questions, batch_num, all_qa_pairs):
22
+ tasks = []
23
+ for q in tqdm(questions, desc=f"Batch {batch_num}", leave=False):
24
+ tasks.append(generate_answer(q))
25
+ answers = await asyncio.gather(*tasks)
26
+
27
+ # Create list of question-answer pairs and append to existing pairs
28
+ for q, a in zip(questions, answers):
29
+ q.append({'role': 'assistant', 'content': a})
30
+ all_qa_pairs.append({"conversations": q})
31
+
32
+ # Save all results after each batch
33
+ with open('qa_pairs_all-alpha1b_2.json', 'w') as f:
34
+ json.dump(all_qa_pairs, f, indent=2)
35
+
36
+ return answers
37
+
38
+ async def main():
39
+ dataset = load_dataset('qnguyen3/sft-r1')
40
+ # Load existing QA pairs if file exists
41
+ all_qa_pairs = []
42
+ if os.path.exists('qa_pairs_all-alpha1b_2.json'):
43
+ with open('qa_pairs_all-alpha1b_2.json', 'r') as f:
44
+ all_qa_pairs = json.load(f)
45
+
46
+ # Prepare questions
47
+ question_list = []
48
+ print("Preparing questions...")
49
+ for i, item in tqdm(enumerate(dataset['train']), desc="Loading dataset"):
50
+ if i >= 21600:
51
+ question_list.append(item['messages'][:-1])
52
+
53
+ # Process in batches of 200
54
+ batch_size = 200
55
+ for i in tqdm(range(0, len(question_list), batch_size), desc="Processing batches"):
56
+ batch_questions = question_list[i:i+batch_size]
57
+ batch_num = i // batch_size
58
+ await process_batch(batch_questions, batch_num, all_qa_pairs)
59
+
60
+ if __name__ == "__main__":
61
+ asyncio.run(main())
gen_ans2.py ADDED
@@ -0,0 +1,60 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from datasets import load_dataset
2
+ from openai import AsyncOpenAI
3
+ from tqdm import tqdm
4
+ import asyncio
5
+ import json
6
+ import os
7
+
8
+ client = AsyncOpenAI(api_key="no-need", base_url="http://147.185.42.13:24585/v1")
9
+
10
+ async def generate_answer(messages):
11
+ try:
12
+ response = await client.chat.completions.create(
13
+ model="Qwen/Qwen2.5-7B-Instruct",
14
+ messages=messages,
15
+ max_tokens=2048,
16
+ )
17
+ return response.choices[0].message.content
18
+ except Exception as e:
19
+ return 'error'
20
+
21
+ async def process_batch(questions, batch_num, all_qa_pairs):
22
+ tasks = []
23
+ for q in tqdm(questions, desc=f"Batch {batch_num}", leave=False):
24
+ tasks.append(generate_answer(q))
25
+ answers = await asyncio.gather(*tasks)
26
+
27
+ # Create list of question-answer pairs and append to existing pairs
28
+ for q, a in zip(questions, answers):
29
+ q.append({'role': 'assistant', 'content': a})
30
+ all_qa_pairs.append({"conversations": q})
31
+
32
+ # Save all results after each batch
33
+ with open('qa_pairs_all-qwen7b.json', 'w') as f:
34
+ json.dump(all_qa_pairs, f, indent=2)
35
+
36
+ return answers
37
+
38
+ async def main():
39
+ dataset = load_dataset('qnguyen3/sft-r1')
40
+ # Load existing QA pairs if file exists
41
+ all_qa_pairs = []
42
+ if os.path.exists('qa_pairs_all-qwen7b.json'):
43
+ with open('qa_pairs_all-qwen7b.json', 'r') as f:
44
+ all_qa_pairs = json.load(f)
45
+
46
+ # Prepare questions
47
+ question_list = []
48
+ print("Preparing questions...")
49
+ for i, item in tqdm(enumerate(dataset['train']), desc="Loading dataset"):
50
+ question_list.append(item['messages'][:-1])
51
+
52
+ # Process in batches of 200
53
+ batch_size = 200
54
+ for i in tqdm(range(0, len(question_list), batch_size), desc="Processing batches"):
55
+ batch_questions = question_list[i:i+batch_size]
56
+ batch_num = i // batch_size
57
+ await process_batch(batch_questions, batch_num, all_qa_pairs)
58
+
59
+ if __name__ == "__main__":
60
+ asyncio.run(main())
load.py ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ### read qa_pairs_all-alpha1b.json
2
+ import json
3
+
4
+ with open('qa_pairs_all-alpha1b.json', 'r') as f:
5
+ data = json.load(f)
6
+
7
+ with open('qa_pairs_all-alpha1b_2.json', 'r') as f:
8
+ data2 = json.load(f)
9
+
10
+ ### concat and save as qa_pairs_all-alpha1b_3.json
11
+ data.extend(data2)
12
+ with open('qa_pairs_all-alpha1b-final.json', 'w') as f:
13
+ json.dump(data, f, indent=2, ensure_ascii=False)
qa_pairs_all-alpha1b-final.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b172c524b52bf65c665d3a80965138d96358478c954c3c7c9a5bd9c77ec92458
3
+ size 132102846
qa_pairs_all-alpha1b.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9daa5f71e42042eb06d7caa3a2b6d5fa216458700ef56f9759b1bb88eb9f9e7d
3
+ size 52337224
qa_pairs_all-alpha1b_2.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:55782571d4178a1f2817166820f271f7afef5a02370370103e5e7602afc77411
3
+ size 80234968
qa_pairs_all-qwen7b.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:356685dd6959ef2bc769a93ead26ce5e3f521d731c7e2835351177f1009c25e5
3
+ size 166879355