# Mini-LLM / data / raw / extract_all.py
# Uploaded by Ashx098 via huggingface_hub (revision f4e346e, verified)
import os
import pyarrow.parquet as pq
from glob import glob
from tqdm import tqdm
# Corpus source directories, each searched recursively for parquet shards.
INPUT_DIRS = ["books", "fineweb", "wikipedia"]

# Everything is merged into a single plain-text corpus file.
OUTPUT_DIR = "merged_text"
os.makedirs(OUTPUT_DIR, exist_ok=True)  # idempotent: fine if it already exists
OUT_FILE = os.path.join(OUTPUT_DIR, "corpus.txt")
def extract_text_from_parquet(path):
    """Return a list of text strings extracted from one parquet file.

    Tries a set of well-known text column names first; if none matches,
    falls back to the first object-dtype (string-like) column. Returns []
    when the file is unreadable or contains no string-like column.
    """
    candidate_cols = ("text", "content", "document", "article", "source")
    try:
        # Inspect the schema first so we can load ONLY the column we need,
        # instead of materializing the entire table in memory (the original
        # read every column of every shard just to keep one).
        schema = pq.read_schema(path)
        for col in candidate_cols:
            if col in schema.names:
                table = pq.read_table(path, columns=[col])
                return table.column(col).to_pandas().astype(str).tolist()
        # Fallback: no known column name — load the table and take the
        # first string-like (object-dtype) column, as before.
        df = pq.read_table(path).to_pandas()
        for col in df.columns:
            if df[col].dtype == object:
                return df[col].astype(str).tolist()
        return []
    except Exception as e:
        # Best-effort: a corrupt or unreadable shard is reported and
        # skipped rather than aborting the whole merge.
        print(f"Error reading {path}: {e}")
        return []
# Recursively collect every parquet shard under each input directory.
all_parquet_files = [
    parquet_path
    for source_dir in INPUT_DIRS
    for parquet_path in glob(f"{source_dir}/**/*.parquet", recursive=True)
]
print("Total parquet files found:", len(all_parquet_files))
# Stream every extracted snippet into a single corpus file, keeping only
# entries that pass a minimal quality filter.
with open(OUT_FILE, "w", encoding="utf-8") as fout:
    for path in tqdm(all_parquet_files, desc="Extracting text"):
        for raw in extract_text_from_parquet(path):
            snippet = raw.strip()
            # Keep snippets that are long enough (>= 50 chars) and contain
            # at least one letter — drops pure punctuation/number noise.
            if len(snippet) >= 50 and any(ch.isalpha() for ch in snippet):
                fout.write(snippet + "\n\n")
print("DONE! Saved merged corpus →", OUT_FILE)