first version

Files changed (15) hide show

.gitattributes +1 -34
.gitignore +1 -0
.mise.toml +21 -0
README.md +90 -0
data/eval.csv +0 -0
data/train.csv +0 -0
embed.py +34 -0
model/model.pickle +3 -0
out/confusion_matrix.png +0 -0
out/preds.csv +45 -0
out/roc_curve.png +0 -0
requirements.dev.txt +1 -0
requirements.txt +6 -0
ruff.toml +2 -0
train.py +120 -0

.gitattributes CHANGED Viewed

@@ -1,35 +1,2 @@
-*.7z filter=lfs diff=lfs merge=lfs -text
-*.arrow filter=lfs diff=lfs merge=lfs -text
-*.bin filter=lfs diff=lfs merge=lfs -text
-*.bz2 filter=lfs diff=lfs merge=lfs -text
-*.ckpt filter=lfs diff=lfs merge=lfs -text
-*.ftz filter=lfs diff=lfs merge=lfs -text
-*.gz filter=lfs diff=lfs merge=lfs -text
-*.h5 filter=lfs diff=lfs merge=lfs -text
-*.joblib filter=lfs diff=lfs merge=lfs -text
-*.lfs.* filter=lfs diff=lfs merge=lfs -text
-*.mlmodel filter=lfs diff=lfs merge=lfs -text
-*.model filter=lfs diff=lfs merge=lfs -text
-*.msgpack filter=lfs diff=lfs merge=lfs -text
-*.npy filter=lfs diff=lfs merge=lfs -text
-*.npz filter=lfs diff=lfs merge=lfs -text
-*.onnx filter=lfs diff=lfs merge=lfs -text
-*.ot filter=lfs diff=lfs merge=lfs -text
-*.parquet filter=lfs diff=lfs merge=lfs -text
-*.pb filter=lfs diff=lfs merge=lfs -text
 *.pickle filter=lfs diff=lfs merge=lfs -text
-*.pkl filter=lfs diff=lfs merge=lfs -text
-*.pt filter=lfs diff=lfs merge=lfs -text
-*.pth filter=lfs diff=lfs merge=lfs -text
-*.rar filter=lfs diff=lfs merge=lfs -text
-*.safetensors filter=lfs diff=lfs merge=lfs -text
-saved_model/**/* filter=lfs diff=lfs merge=lfs -text
-*.tar.* filter=lfs diff=lfs merge=lfs -text
-*.tar filter=lfs diff=lfs merge=lfs -text
-*.tflite filter=lfs diff=lfs merge=lfs -text
-*.tgz filter=lfs diff=lfs merge=lfs -text
-*.wasm filter=lfs diff=lfs merge=lfs -text
-*.xz filter=lfs diff=lfs merge=lfs -text
-*.zip filter=lfs diff=lfs merge=lfs -text
-*.zst filter=lfs diff=lfs merge=lfs -text
-*tfevents* filter=lfs diff=lfs merge=lfs -text


1	+ *.csv filter=lfs diff=lfs merge=lfs -text


















2	*.pickle filter=lfs diff=lfs merge=lfs -text

.gitignore ADDED Viewed

	@@ -0,0 +1 @@


1	+ __pycache__

.mise.toml ADDED Viewed

	@@ -0,0 +1,21 @@

+[env]
+_.python.venv = ".venv"
+EMBEDDING_MODEL = "Snowflake/snowflake-arctic-embed-xs"
+EMBEDDING_MODEL_REV = "d8c86521100d3556476a063fc2342036d45c106f"
+DATA_DIR = "{{config_root}}/data"
+MODEL_DIR = "{{config_root}}/model"
+OUT_DIR = "{{config_root}}/out"
+[tasks.deps]
+run = [
+  "uv pip install -r {{config_root}}/requirements.txt",
+  "uv pip install -r {{config_root}}/requirements.dev.txt",
+]
+[tasks."code:fmt"]
+run = "ruff format {{config_root}}"
+[tasks."code:lint"]
+run = "ruff check --fix {{config_root}}"

README.md ADDED Viewed

	@@ -0,0 +1,90 @@

+# 🧠 Article Relevance Classifier (Prototype)
+This project aims to classify news articles as **relevant** (i.e., discussing a new event) or **non-relevant**. The articles classified as relevant are then provided to an LLM pipeline. We should therefore minimize the false positive rate, as we don't want the LLM pipeline to be polluted with non-relevant articles.
+## 🧾 Available Features
+For each article, we collect a set of features from both the metadata and the raw content of the web page:
+- **Metadata Title**: The `<title>` tag of the page, often used by browsers and search engines.
+- **Metadata Description**: The `<meta name="description">` field, typically summarizing the article content.
+- **Content**: The main textual content of the article, extracted using [trafilatura](https://github.com/adbar/trafilatura).
+- **Date**: The publication date of the article (when available).
+- **CSS Title**: A title found in the visible content, usually marked with large or header-style HTML tags (e.g., `<h1>`).
+- **og:type**: The Open Graph `og:type` property, which often indicates the type of content (e.g., `article`, `video`, `website`).
+- **Text-to-HTML Ratio**: The ratio between the length of the extracted text and the total HTML size, indicating how content-focused the page is.
+- **Paragraph Count**: The number of `<p>` tags, giving a rough idea of how much structured text the page contains.
+- **Link Count**: The total number of hyperlinks (`<a>` tags) on the page.
+- **Weekday**: The day of the week the article was published, which can help identify publishing patterns.
+- **Average Link Count of the Website**: The average number of hyperlinks per page across the entire website domain. This helps differentiate content-heavy domains from link-heavy or index-style sites.
+➡️ The **Link Count** feature becomes more meaningful when **compared to the Average Link Count of the Website**. For example, a page with very few links on a generally link-heavy site may indicate that it is an article rather than a hub or landing page.
+⚠️ However, **computing the Average Link Count of the Website requires crawling multiple pages of the same domain**, which is not feasible in a real-time prediction setting (i.e., when you want to classify a single article instantly). For this reason, features like **Average Link Count of the Website** can only be used during offline training and are not available at inference time.
+## 🔍 Approach
+For this first prototype, I decided to use **text embeddings** to semantically represent each article. These embeddings are then used to train a binary classifier.
+Each article is represented using three components:
+- The **title** (from metadata, up to 512 tokens),
+- The **description** (from metadata, up to 512 tokens),
+- The **main content** (extracted using [trafilatura](https://github.com/adbar/trafilatura), up to 512 tokens).
+⚠️ Due to token limits, only the beginning of each text field is used. This may affect classification performance when relevant information appears later in the article.
+For the classifier, I chose a **Support Vector Machine (SVM)** model because:
+- **K-Nearest Neighbors (KNN)** is too slow at inference time due to the high dimensionality (three concatenated embedding vectors per article),
+- **Random Forests** risk overfitting when dealing with a large number of features,
+- **Logistic Regression** is a viable alternative, but SVMs generally perform better on high-dimensional, sparse datasets.
+### 📊 Results (Test 1)
+Below is the confusion matrix for the first test:
+![Confusion Matrix](images/confusion_matrix_test_1.png)
+- **Accuracy**: (15 + 18) / (15 + 7 + 4 + 18) = **75.0%**
+- **Precision (Relevant)**: 18 / (18 + 7) = **72.0%**
+- **Recall (Relevant)**: 18 / (18 + 4) = **81.8%**
+These initial results suggest the model can already capture some meaningful signals, although there is room for improvement, especially in reducing false positives.
+## 🧪 Second Test: Chunked Embeddings with Averaging
+To address the limitations of the first test, a second experiment was conducted using **chunked embeddings**:
+- Instead of truncating the text at 512 tokens, each field (title, description, content) is **split into multiple chunks** of up to 512 tokens.
+- Each chunk is embedded separately.
+- The final representation is the **average of all the chunk embeddings**.
+This method allows the model to consider a **broader portion of the article**, potentially capturing relevant information that appears later in the text.
+The goal of this second test is to evaluate whether this approach improves classification performance compared to the truncated version.
+### 📊 Results (Test 2)
+Below is the confusion matrix for the second test:
+![Confusion Matrix](images/confusion_matrix_test_2.png)
+- **Accuracy**: (16 + 18) / (16 + 6 + 4 + 18) = **77.3%**
+- **Precision (Relevant)**: 18 / (18 + 6) = **75.0%**
+- **Recall (Relevant)**: 18 / (18 + 4) = **81.8%**
+Compared to Test 1, this version shows a slight improvement in both **accuracy** and **precision**, indicating that including more of the article content via chunked embeddings helps reduce false positives and better capture relevant information.
+#### 📈 ROC Curve
+For this test, I also generated the ROC curve:
+![ROC Curve](images/roc_curve_test_2.png)
+The curve appears to have a stepped shape, which is expected due to the **limited number of test samples**. As a result, it’s difficult to draw strong conclusions from the ROC curve alone.
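+However, we may tentatively observe that **raising the decision threshold** (i.e., requiring a higher predicted probability before labeling an article as relevant) could help reduce false positives, at the cost of some recall — a promising direction to explore in future experiments with more data.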
+#### Analysis of the results
+After analyzing the results, it seems that the model has difficulty distinguishing between the content types of articles, specifically whether they are news or not. However, it excels at identifying the structural layout of pages, such as determining if a page is a homepage, article, video, etc. Therefore, adding features like og:type, text-to-HTML ratio, and paragraph count may not be beneficial, as these features are primarily useful for differentiating page structure rather than content type.

data/eval.csv ADDED Viewed

The diff for this file is too large to render. See raw diff

data/train.csv ADDED Viewed

The diff for this file is too large to render. See raw diff

embed.py ADDED Viewed

	@@ -0,0 +1,34 @@

+import os
+from functools import cache
+from itertools import batched
+from typing import Generator, Iterator
+import numpy as np
+from numpy.typing import NDArray
+from sentence_transformers import SentenceTransformer
+def split(text: str, max_tokens: int = 512) -> Generator[str, None, None]:
+    # Naive approach - use opale internal chunking techniques (special tokens count)
+    words = text.split()
+    if not words:
+        return
+    for batch in batched(words, max_tokens // 2):  # Assuming 2 tokens per word
+        yield " ".join(batch)
@cache
def get_model():
    """Load the sentence-transformers model once and reuse it afterwards.

    The model name and pinned revision are read from the ``EMBEDDING_MODEL``
    and ``EMBEDDING_MODEL_REV`` environment variables; a missing variable
    raises ``KeyError``. ``functools.cache`` memoizes the result, so later
    calls return the same instance.
    """
    model_name = os.environ["EMBEDDING_MODEL"]
    model_revision = os.environ["EMBEDDING_MODEL_REV"]
    return SentenceTransformer(model_name, revision=model_revision)
def embed(texts: Iterator[str], max_tokens: int = 512) -> NDArray:
    """Embed each text as the mean of its chunk embeddings.

    Every text is cut into chunks with ``split``; the chunks are encoded by
    the cached model and averaged into a single vector for that text.

    Args:
        texts: Texts to embed; the argument is only iterated once, so any
            iterable works. ``None`` or blank entries are embedded as the
            empty string.
        max_tokens: Approximate token budget per chunk, forwarded to ``split``.

    Returns:
        An array with one averaged embedding per input text, in input order.
    """
    res: list[NDArray] = []
    for text in texts:
        # A blank text yields no chunks, and averaging zero embeddings does not
        # give a usable vector of the model's dimension. Fall back to a single
        # empty chunk so every row has the same shape.
        chunks = list(split(text or "", max_tokens)) or [""]
        embeddings = get_model().encode(chunks, show_progress_bar=False)
        res.append(np.mean(embeddings, axis=0))
    return np.array(res)

model/model.pickle ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:eef2e93850a38b0e98b68ac0e0a32f4a67408e7327a7646ed9e96019c3dc7583
+size 5293480

out/confusion_matrix.png ADDED Viewed

out/preds.csv ADDED Viewed

	@@ -0,0 +1,45 @@

+url,is_news_article,prediction,is_prediction_correct
+https://quantumcomputingreport.com/quandela-launches-belenos-photonic-quantum-computer-with-doubling-of-qubit-count-and-4000x-power-increase/,true,true,true
+https://www.nqcc.ac.uk/,false,false,true
+https://quantumcomputingreport.com/qsensato-raises-e500k-560k-usd-to-advance-integrated-atomic-quantum-sensors-for-precision-sensing/,true,true,true
+https://quantumcomputingreport.com/zurich-instruments-and-rohde-schwarz-join-australias-national-quantum-computing-testbed-facility/,true,true,true
+https://quantumcomputingreport.com/hbku-launches-qatars-first-quantum-computing-laboratory-backed-by-10m-mod-grant/,true,true,true
+https://quantumcomputingreport.com/quantinuum-releases-%ce%bbambeq-gen-ii-for-scalable-interpretable-quantum-nlp/,true,false,false
+https://quantumcomputingreport.com/quobly-secures-e21m-23-7m-usd-to-industrialize-100-qubit-silicon-quantum-processor/,true,true,true
+https://quantumcomputingreport.com/semiqon-and-nanoacademic-partner-to-advance-silicon-spin-qubit-research-and-education/,true,true,true
+https://quantumcomputingreport.com/united-nations-itu-launches-quantum-for-good-to-align-innovation-with-global-impact/,true,false,false
+https://quantumcomputingreport.com/microsoft-adds-post-quantum-cryptography-to-windows-insider-builds-and-linux/,true,true,true
+https://www.nqcc.ac.uk/technology-and-research/our-research/,false,false,true
+https://quantumcomputingreport.com/podcast-with-scott-davis-ceo-and-co-founder-of-vescent/,false,false,true
+https://quantumzeitgeist.com/building-atoms-the-rise-of-nanotechnology-and-molecular-engineering/,false,true,false
+https://quantumzeitgeist.com/networked-services-technologies-applications-and-challenges-for-advanced-communication/,false,false,true
+https://quantumzeitgeist.com/amazon-braket-sdk-and-multi-platform-quantum-development/,false,true,false
+https://quantumzeitgeist.com/pennylane-and-quantum-machine-learning/,false,false,true
+https://quantumzeitgeist.com/quantum-physics-meets-spiritual-philosophy-exploring-the-intersection-of-string-theory-and-consciousness/,false,false,true
+https://quantumzeitgeist.com/quantum-computing-transforms-financial-derivatives-pricing-for-complex-options-and-risk-analysis/,false,true,false
+https://quantumzeitgeist.com/quantifying-quantum-correlations-in-symmetric-gaussian-states-with-universal-invariants/,true,false,false
+https://www.horseandhound.co.uk/news/horse-life-threatening-stomach-tumour-saved-pioneering-surgery-894298,true,true,true
+https://www.maddyness.com/2025/06/02/vivatech-startups-deals-annonces-ce-que-la-mission-french-tech-prevoit-pour-levenement/,false,false,true
+https://www.cbsnews.com/sanfrancisco/news/padel-a-fast-growing-sport-has-become-a-new-obsession-for-silicon-valley/,false,true,false
+https://www.cloudcomputing-news.net/news/microsoft-launches-its-first-cloud-region-in-malaysia/,true,true,true
+https://padelmagazine.fr/best-padel-racket-awards-2025-les-meilleures-raquettes-de-lannee-devoilees/,false,false,true
+https://www.horseandhound.co.uk/news/polly-dickson-obituary-894506,true,true,true
+https://www.homeselect.paris/en/blog/devenir-proprietaire,false,false,true
+https://www.maddyness.com/2020/10/23/salomon-aiach-interview-facebook-startups/,false,false,true
+https://www.solarpowerportal.co.uk/grid-operators-must-work-together-in-aftermath-of-spain-and-portugal-blackout/,false,true,false
+https://www.cloudcomputing-news.net/news/podcast/nginx-f5-api-proxy-podcast-apac-sprint-two-point-one-podcast-s02-e30/,false,false,true
+https://www.farminguk.com/news/vegan-activists-attempt-to-shut-down-royal-highland-parade_66662.html,true,true,true
+https://dairynews.today/news/world_milk_day_2025_health_innovation_and_sustainability_drive_india_s_milk_movement_9339211.html,false,true,false
+"https://lerail.com/news/95810-signature-du-second-appel-%C3%A0-projets-gares-de-demain-entre-la-r%C3%A9gion-%C3%AEle-de-france,-%C3%AEle-de-france-mobilit%C3%A9s-et-sncf-gares-connexions",true,false,false
+https://lerail.com/news/95984-drive-to-zero-2025,false,false,true
+https://www.horseandhound.co.uk/news/farewell-to-twinshock-warrior-894106,true,true,true
+https://www.farminguk.com/news/new-ai-driven-test-targets-silent-killer-in-uk-cattle_66604.html,true,true,true
+https://www.maddyness.com/2019/05/02/growthhacking-chahab-nastar-scaleups/,false,false,true
+https://www.businesstravelnews.com/Lodging/Hyatt-Creates-New-Unscripted-Collection-Brand,true,false,false
+https://meuble-info.fr/falmec-gessi-le-duo-gagnant-du-point-deau/,true,false,false
+https://www.cloudcomputing-news.net/news/podcast/supply-chain-automation-warehousing-distribution-rpa-best-dematic-podcast-s03-e10/,false,false,true
+https://www.maddyness.com/2025/05/06/mon-petit-placement-tombe-dans-le-giron-de-malakoff-humanis/,true,false,false
+https://lerail.com/technical-articles/79770-southco-s%C3%A9curisation-du-v%C3%A9hicule-%C3%A9lectrique-infrastructure-de-recharge-et-de-stockage-sur-batterie-de-r%C3%A9seau,false,false,true
+https://www.watches-news.com/alpine-eagle-41-xp-cs-platinum/,true,true,true
+https://www.imarcgroup.com/football-market,false,true,false
+https://www.constructionnews.co.uk/contractors/balfour-beatty/balfour-beatty-court-battle-over-serious-trucks-cartel-ends-17-01-2025/,true,true,true

out/roc_curve.png ADDED Viewed

requirements.dev.txt ADDED Viewed

	@@ -0,0 +1 @@


1	+ python-lsp-server

requirements.txt ADDED Viewed

	@@ -0,0 +1,6 @@

+matplotlib
+numpy
+polars
+scikit-learn
+seaborn
+sentence-transformers

ruff.toml ADDED Viewed

	@@ -0,0 +1,2 @@


1	+ [lint]
2	+ extend-select = ["I"]

train.py ADDED Viewed

	@@ -0,0 +1,120 @@

+import logging
+import os
+import pickle
+from pathlib import Path
+import matplotlib.pyplot as plt
+import numpy as np
+import polars as pl
+import seaborn as sns
+from numpy.typing import NDArray
+from polars import DataFrame
+from sklearn.metrics import auc, confusion_matrix, roc_curve
+from sklearn.svm import SVC
+from embed import embed as _embed
# Module-level logger; the root logger is configured at INFO level on import.
logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)

# Working directories come from the environment (a missing variable raises
# KeyError). Each one is created right after it is resolved, parents included.
DATA = Path(os.environ["DATA_DIR"])
DATA.mkdir(exist_ok=True, parents=True)

MODEL = Path(os.environ["MODEL_DIR"])
MODEL.mkdir(exist_ok=True, parents=True)

OUT = Path(os.environ["OUT_DIR"])
OUT.mkdir(exist_ok=True, parents=True)
def embed(df: DataFrame):
    """Embed the text columns of *df* and concatenate them feature-wise.

    The ``content``, ``meta_title`` and ``meta_description`` columns are each
    embedded with the helper imported from ``embed.py``; the three resulting
    arrays are stacked horizontally in that order.

    Returns:
        The horizontally stacked embedding array.
    """
    logger.info(f"embed start {df.height}")
    text_columns = ("content", "meta_title", "meta_description")
    per_column = [
        _embed(df.select(name).to_series().to_list()) for name in text_columns
    ]
    stacked = np.hstack(per_column)
    logger.info(f"embed done {stacked.shape}")
    return stacked
def train(df: DataFrame, target: str):
    """Fit a linear-kernel SVM on the embedded rows of *df*.

    Args:
        df: Training rows; must contain the text columns used by ``embed``
            and the *target* column.
        target: Name of the label column.

    Returns:
        The fitted ``SVC``, created with ``probability=True`` so that
        ``predict_proba`` is available afterwards.
    """
    logger.info(f"train start {df.height}")
    features = embed(df)
    labels = df.select(target).to_numpy().ravel()
    classifier = SVC(probability=True, kernel="linear")
    classifier.fit(features, labels)
    logger.info("train done")
    return classifier
def save_prediction(eval_df: DataFrame, y_eval: NDArray, y_pred: NDArray) -> None:
    """Write one row per evaluated URL to ``preds.csv`` in the output directory.

    Columns: ``url``, the true label (``is_news_article``), the model's
    ``prediction``, and ``is_prediction_correct`` (element-wise equality of
    the two).
    """
    urls = eval_df.select("url").to_series().to_list()
    report = pl.DataFrame(
        {
            "url": urls,
            "is_news_article": y_eval,
            "prediction": y_pred,
            "is_prediction_correct": y_eval == y_pred,
        }
    )
    report.write_csv(OUT / "preds.csv")
def save_roc_curve(clf, X: NDArray, y: NDArray):
    """Plot the ROC curve of *clf* on ``(X, y)`` and save it as ``roc_curve.png``.

    Args:
        clf: Fitted classifier exposing ``predict_proba``.
        X: Feature matrix to score.
        y: True labels for the rows of *X*.
    """
    # Column 1 is taken as the positive-class probability; this assumes the
    # positive label is the second entry of ``clf.classes_``.
    probs = clf.predict_proba(X)[:, 1]
    # The thresholds returned by roc_curve are not needed for the plot.
    fpr, tpr, _ = roc_curve(y, probs)
    roc_auc = auc(fpr, tpr)
    plt.figure(figsize=(6, 5))
    plt.plot(
        fpr, tpr, color="darkorange", lw=2, label=f"ROC curve (AUC = {roc_auc:.2f})"
    )
    # Diagonal reference line: the expected curve of a random classifier.
    plt.plot([0, 1], [0, 1], color="navy", lw=2, linestyle="--")
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel("False Positive Rate")
    plt.ylabel("True Positive Rate")
    plt.title("Receiver Operating Characteristic (ROC)")
    plt.legend(loc="lower right")
    plt.tight_layout()
    plt.savefig(OUT / "roc_curve.png")
    plt.close()
def save_confusion_matrix(y: NDArray, pred: NDArray):
    """Render the confusion matrix of *pred* against *y* to ``confusion_matrix.png``.

    Rows are the actual labels and columns the predicted ones; both axes are
    labelled "Not Relevant" / "Relevant".
    """
    class_names = ["Not Relevant", "Relevant"]
    plt.figure(figsize=(5, 4))
    matrix = confusion_matrix(y, pred)
    sns.heatmap(
        matrix,
        annot=True,
        fmt="d",
        cmap="Blues",
        xticklabels=class_names,
        yticklabels=class_names,
    )
    plt.xlabel("Predicted")
    plt.ylabel("Actual")
    plt.title("Confusion Matrix")
    plt.tight_layout()
    plt.savefig(OUT / "confusion_matrix.png")
    plt.close()
def main() -> None:
    """Train the classifier, persist it, then evaluate it on the eval set.

    Reads ``train.csv`` and ``eval.csv`` from the data directory, pickles the
    fitted model to ``model.pickle`` in the model directory, and writes the
    predictions, confusion matrix and ROC curve to the output directory.
    """
    label_column = "is_news_article"

    # Training phase: fit on the training set and persist the model.
    classifier = train(pl.read_csv(DATA / "train.csv"), label_column)
    with (MODEL / "model.pickle").open("wb") as model_file:
        pickle.dump(classifier, model_file)

    # Evaluation phase: embed the eval set and score it with the fresh model.
    eval_df = pl.read_csv(DATA / "eval.csv")
    logger.info(f"eval start {eval_df.height}")
    eval_features = embed(eval_df)
    eval_labels = eval_df.select(label_column).to_numpy().ravel()
    eval_pred = classifier.predict(eval_features)

    save_prediction(eval_df, eval_labels, eval_pred)
    save_confusion_matrix(eval_labels, eval_pred)
    save_roc_curve(classifier, eval_features, eval_labels)
    logger.info("eval done")


if __name__ == "__main__":
    main()