Ali Kefia committed on
Commit
4c31c97
·
1 Parent(s): 231da5b
.gitattributes CHANGED
@@ -1,2 +1,3 @@
1
  *.csv filter=lfs diff=lfs merge=lfs -text
 
2
  *.pickle filter=lfs diff=lfs merge=lfs -text
 
1
  *.csv filter=lfs diff=lfs merge=lfs -text
2
+ *.parquet filter=lfs diff=lfs merge=lfs -text
3
  *.pickle filter=lfs diff=lfs merge=lfs -text
.mise.toml CHANGED
@@ -6,7 +6,7 @@ EMBEDDING_MODEL_REV = "d8c86521100d3556476a063fc2342036d45c106f"
6
 
7
  DATA_DIR = "{{config_root}}/data"
8
  MODEL_DIR = "{{config_root}}/model"
9
- OUT_DIR = "{{config_root}}/out"
10
 
11
  [tasks.deps]
12
  run = [
 
6
 
7
  DATA_DIR = "{{config_root}}/data"
8
  MODEL_DIR = "{{config_root}}/model"
9
+ IMGS_DIR = "{{config_root}}/imgs"
10
 
11
  [tasks.deps]
12
  run = [
data/eval.parquet ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1ce81584baeb7eb8ca4322bc0f50af105ae3795229718cda1dfa1f600e945f3a
3
+ size 195251
data/train.parquet ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bf199fc047485c2c453c4d9b80714261ed58152ef34c59903a64f9725d0e4956
3
+ size 6608000
debug.py ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import polars as pl
2
+
3
+ from utils.paths import DATA
4
+
5
+
6
+ def main() -> None:
7
+ for name in ["train", "eval"]:
8
+ df = pl.read_parquet(DATA / (name + ".parquet"))
9
+ print(df)
10
+
11
+
12
+ if __name__ == "__main__":
13
+ main()
imgs/confusion_matrix.png ADDED
imgs/roc_curve.png ADDED
model/model.pickle CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:eef2e93850a38b0e98b68ac0e0a32f4a67408e7327a7646ed9e96019c3dc7583
3
- size 5293480
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:73bf71607b6b99d8576a79ec96cdf97e008134e7d348477f93b8cdcf057db19e
3
+ size 3411728
out/confusion_matrix.png DELETED
Binary file (16.4 kB)
 
out/preds.csv DELETED
@@ -1,45 +0,0 @@
1
- url,is_news_article,prediction,is_prediction_correct
2
- https://quantumcomputingreport.com/quandela-launches-belenos-photonic-quantum-computer-with-doubling-of-qubit-count-and-4000x-power-increase/,true,true,true
3
- https://www.nqcc.ac.uk/,false,false,true
4
- https://quantumcomputingreport.com/qsensato-raises-e500k-560k-usd-to-advance-integrated-atomic-quantum-sensors-for-precision-sensing/,true,true,true
5
- https://quantumcomputingreport.com/zurich-instruments-and-rohde-schwarz-join-australias-national-quantum-computing-testbed-facility/,true,true,true
6
- https://quantumcomputingreport.com/hbku-launches-qatars-first-quantum-computing-laboratory-backed-by-10m-mod-grant/,true,true,true
7
- https://quantumcomputingreport.com/quantinuum-releases-%ce%bbambeq-gen-ii-for-scalable-interpretable-quantum-nlp/,true,false,false
8
- https://quantumcomputingreport.com/quobly-secures-e21m-23-7m-usd-to-industrialize-100-qubit-silicon-quantum-processor/,true,true,true
9
- https://quantumcomputingreport.com/semiqon-and-nanoacademic-partner-to-advance-silicon-spin-qubit-research-and-education/,true,true,true
10
- https://quantumcomputingreport.com/united-nations-itu-launches-quantum-for-good-to-align-innovation-with-global-impact/,true,false,false
11
- https://quantumcomputingreport.com/microsoft-adds-post-quantum-cryptography-to-windows-insider-builds-and-linux/,true,true,true
12
- https://www.nqcc.ac.uk/technology-and-research/our-research/,false,false,true
13
- https://quantumcomputingreport.com/podcast-with-scott-davis-ceo-and-co-founder-of-vescent/,false,false,true
14
- https://quantumzeitgeist.com/building-atoms-the-rise-of-nanotechnology-and-molecular-engineering/,false,true,false
15
- https://quantumzeitgeist.com/networked-services-technologies-applications-and-challenges-for-advanced-communication/,false,false,true
16
- https://quantumzeitgeist.com/amazon-braket-sdk-and-multi-platform-quantum-development/,false,true,false
17
- https://quantumzeitgeist.com/pennylane-and-quantum-machine-learning/,false,false,true
18
- https://quantumzeitgeist.com/quantum-physics-meets-spiritual-philosophy-exploring-the-intersection-of-string-theory-and-consciousness/,false,false,true
19
- https://quantumzeitgeist.com/quantum-computing-transforms-financial-derivatives-pricing-for-complex-options-and-risk-analysis/,false,true,false
20
- https://quantumzeitgeist.com/quantifying-quantum-correlations-in-symmetric-gaussian-states-with-universal-invariants/,true,false,false
21
- https://www.horseandhound.co.uk/news/horse-life-threatening-stomach-tumour-saved-pioneering-surgery-894298,true,true,true
22
- https://www.maddyness.com/2025/06/02/vivatech-startups-deals-annonces-ce-que-la-mission-french-tech-prevoit-pour-levenement/,false,false,true
23
- https://www.cbsnews.com/sanfrancisco/news/padel-a-fast-growing-sport-has-become-a-new-obsession-for-silicon-valley/,false,true,false
24
- https://www.cloudcomputing-news.net/news/microsoft-launches-its-first-cloud-region-in-malaysia/,true,true,true
25
- https://padelmagazine.fr/best-padel-racket-awards-2025-les-meilleures-raquettes-de-lannee-devoilees/,false,false,true
26
- https://www.horseandhound.co.uk/news/polly-dickson-obituary-894506,true,true,true
27
- https://www.homeselect.paris/en/blog/devenir-proprietaire,false,false,true
28
- https://www.maddyness.com/2020/10/23/salomon-aiach-interview-facebook-startups/,false,false,true
29
- https://www.solarpowerportal.co.uk/grid-operators-must-work-together-in-aftermath-of-spain-and-portugal-blackout/,false,true,false
30
- https://www.cloudcomputing-news.net/news/podcast/nginx-f5-api-proxy-podcast-apac-sprint-two-point-one-podcast-s02-e30/,false,false,true
31
- https://www.farminguk.com/news/vegan-activists-attempt-to-shut-down-royal-highland-parade_66662.html,true,true,true
32
- https://dairynews.today/news/world_milk_day_2025_health_innovation_and_sustainability_drive_india_s_milk_movement_9339211.html,false,true,false
33
- "https://lerail.com/news/95810-signature-du-second-appel-%C3%A0-projets-gares-de-demain-entre-la-r%C3%A9gion-%C3%AEle-de-france,-%C3%AEle-de-france-mobilit%C3%A9s-et-sncf-gares-connexions",true,false,false
34
- https://lerail.com/news/95984-drive-to-zero-2025,false,false,true
35
- https://www.horseandhound.co.uk/news/farewell-to-twinshock-warrior-894106,true,true,true
36
- https://www.farminguk.com/news/new-ai-driven-test-targets-silent-killer-in-uk-cattle_66604.html,true,true,true
37
- https://www.maddyness.com/2019/05/02/growthhacking-chahab-nastar-scaleups/,false,false,true
38
- https://www.businesstravelnews.com/Lodging/Hyatt-Creates-New-Unscripted-Collection-Brand,true,false,false
39
- https://meuble-info.fr/falmec-gessi-le-duo-gagnant-du-point-deau/,true,false,false
40
- https://www.cloudcomputing-news.net/news/podcast/supply-chain-automation-warehousing-distribution-rpa-best-dematic-podcast-s03-e10/,false,false,true
41
- https://www.maddyness.com/2025/05/06/mon-petit-placement-tombe-dans-le-giron-de-malakoff-humanis/,true,false,false
42
- https://lerail.com/technical-articles/79770-southco-s%C3%A9curisation-du-v%C3%A9hicule-%C3%A9lectrique-infrastructure-de-recharge-et-de-stockage-sur-batterie-de-r%C3%A9seau,false,false,true
43
- https://www.watches-news.com/alpine-eagle-41-xp-cs-platinum/,true,true,true
44
- https://www.imarcgroup.com/football-market,false,true,false
45
- https://www.constructionnews.co.uk/contractors/balfour-beatty/balfour-beatty-court-battle-over-serious-trucks-cartel-ends-17-01-2025/,true,true,true
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
out/roc_curve.png DELETED
Binary file (29.3 kB)
 
prepare.py ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+
3
+ import polars as pl
4
+
5
+ from utils.embed import embed as embed
6
+ from utils.paths import DATA
7
+
8
+ logger = logging.getLogger(__name__)
9
+
10
+ logging.basicConfig(level=logging.INFO)
11
+
12
+
13
+ def load_dataset(file_name: str):
14
+ features = ["meta_title", "meta_description", "content"]
15
+ return (
16
+ pl.scan_csv(file_name)
17
+ .with_columns(
18
+ pl.concat_str([pl.col(c) for c in features], separator="\n\n").alias(
19
+ "text"
20
+ ),
21
+ pl.col("date").str.to_date().alias("date"),
22
+ )
23
+ .rename(
24
+ {
25
+ "is_news_article": "is_news",
26
+ "link_count": "links",
27
+ "paragraph_count": "paragraphs",
28
+ }
29
+ )
30
+ .select("text", "is_news", "url", "date", "paragraphs", "links")
31
+ .collect()
32
+ )
33
+
34
+
35
+ def main() -> None:
36
+ for name in ["train", "eval"]:
37
+ df = load_dataset(DATA / (name + ".csv"))
38
+ embeds = embed(df.get_column("text").to_list())
39
+ df = df.with_columns(pl.Series(embeds).alias("embeds")).write_parquet(
40
+ DATA / (name + ".parquet")
41
+ )
42
+
43
+
44
+ if __name__ == "__main__":
45
+ main()
train.py CHANGED
@@ -1,63 +1,17 @@
1
  import logging
2
- import os
3
  import pickle
4
- from pathlib import Path
5
 
6
  import matplotlib.pyplot as plt
7
- import numpy as np
8
  import polars as pl
9
  import seaborn as sns
10
  from numpy.typing import NDArray
11
- from polars import DataFrame
12
  from sklearn.metrics import auc, confusion_matrix, roc_curve
13
  from sklearn.svm import SVC
14
 
15
- from embed import embed as _embed
16
-
17
- logger = logging.getLogger(__name__)
18
 
19
  logging.basicConfig(level=logging.INFO)
20
 
21
- DATA = Path(os.environ["DATA_DIR"])
22
- DATA.mkdir(parents=True, exist_ok=True)
23
- MODEL = Path(os.environ["MODEL_DIR"])
24
- MODEL.mkdir(parents=True, exist_ok=True)
25
- OUT = Path(os.environ["OUT_DIR"])
26
- OUT.mkdir(parents=True, exist_ok=True)
27
-
28
-
29
- def embed(df: DataFrame):
30
- logger.info(f"embed start {df.height}")
31
- features = ["content", "meta_title", "meta_description"]
32
- embeddings = []
33
- for col in features:
34
- train_texts = df.select(col).to_series().to_list()
35
- embeddings.append(_embed(train_texts))
36
- res = np.hstack(embeddings)
37
- logger.info(f"embed done {res.shape}")
38
- return res
39
-
40
-
41
- def train(df: DataFrame, target: str):
42
- logger.info(f"train start {df.height}")
43
- X = embed(df)
44
- y = df.select(target).to_numpy().ravel()
45
- clf = SVC(kernel="linear", probability=True)
46
- clf.fit(X, y)
47
- logger.info("train done")
48
- return clf
49
-
50
-
51
- def save_prediction(eval_df: DataFrame, y_eval: NDArray, y_pred: NDArray) -> None:
52
- pl.DataFrame(
53
- {
54
- "url": eval_df.select("url").to_series().to_list(),
55
- "is_news_article": y_eval,
56
- "prediction": y_pred,
57
- "is_prediction_correct": y_eval == y_pred,
58
- }
59
- ).write_csv(OUT / "preds.csv")
60
-
61
 
62
  def save_roc_curve(clf, X: NDArray, y: NDArray):
63
  probs = clf.predict_proba(X)[:, 1] # Probability for the positive class
@@ -76,7 +30,7 @@ def save_roc_curve(clf, X: NDArray, y: NDArray):
76
  plt.title("Receiver Operating Characteristic (ROC)")
77
  plt.legend(loc="lower right")
78
  plt.tight_layout()
79
- plt.savefig(OUT / "roc_curve.png")
80
  plt.close()
81
 
82
 
@@ -94,26 +48,26 @@ def save_confusion_matrix(y: NDArray, pred: NDArray):
94
  plt.ylabel("Actual")
95
  plt.title("Confusion Matrix")
96
  plt.tight_layout()
97
- plt.savefig(OUT / "confusion_matrix.png")
98
  plt.close()
99
 
100
 
101
  def main() -> None:
102
- target = "is_news_article"
103
- train_df = pl.read_csv(DATA / "train.csv")
104
- clf = train(train_df, target)
 
 
 
105
  with open(MODEL / "model.pickle", "wb") as f:
106
  pickle.dump(clf, f)
107
 
108
- eval_df = pl.read_csv(DATA / "eval.csv")
109
- logger.info(f"eval start {eval_df.height}")
110
- eval_X = embed(eval_df)
111
- eval_y = eval_df.select(target).to_numpy().ravel()
112
  eval_pred = clf.predict(eval_X)
113
- save_prediction(eval_df, eval_y, eval_pred)
114
  save_confusion_matrix(eval_y, eval_pred)
115
  save_roc_curve(clf, eval_X, eval_y)
116
- logger.info("eval done")
117
 
118
 
119
  if __name__ == "__main__":
 
1
  import logging
 
2
  import pickle
 
3
 
4
  import matplotlib.pyplot as plt
 
5
  import polars as pl
6
  import seaborn as sns
7
  from numpy.typing import NDArray
 
8
  from sklearn.metrics import auc, confusion_matrix, roc_curve
9
  from sklearn.svm import SVC
10
 
11
+ from utils.paths import DATA, IMGS, MODEL
 
 
12
 
13
  logging.basicConfig(level=logging.INFO)
14
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
15
 
16
  def save_roc_curve(clf, X: NDArray, y: NDArray):
17
  probs = clf.predict_proba(X)[:, 1] # Probability for the positive class
 
30
  plt.title("Receiver Operating Characteristic (ROC)")
31
  plt.legend(loc="lower right")
32
  plt.tight_layout()
33
+ plt.savefig(IMGS / "roc_curve.png")
34
  plt.close()
35
 
36
 
 
48
  plt.ylabel("Actual")
49
  plt.title("Confusion Matrix")
50
  plt.tight_layout()
51
+ plt.savefig(IMGS / "confusion_matrix.png")
52
  plt.close()
53
 
54
 
55
  def main() -> None:
56
+ train_df = pl.read_parquet(DATA / "train.parquet")
57
+ clf = SVC(kernel="linear", probability=True)
58
+ clf.fit(
59
+ train_df.get_column("embeds").to_numpy(),
60
+ train_df.get_column("is_news").to_numpy(),
61
+ )
62
  with open(MODEL / "model.pickle", "wb") as f:
63
  pickle.dump(clf, f)
64
 
65
+ eval_df = pl.read_parquet(DATA / "eval.parquet")
66
+ eval_X = eval_df.get_column("embeds").to_numpy()
67
+ eval_y = eval_df.get_column("is_news").to_numpy()
 
68
  eval_pred = clf.predict(eval_X)
 
69
  save_confusion_matrix(eval_y, eval_pred)
70
  save_roc_curve(clf, eval_X, eval_y)
 
71
 
72
 
73
  if __name__ == "__main__":
usage.py CHANGED
@@ -1,45 +1,37 @@
1
- import os
2
  import pickle
3
  from functools import cache
4
- from pathlib import Path
5
 
6
- import numpy as np
7
  import polars as pl
8
  from huggingface_hub import hf_hub_download
9
 
10
- from embed import embed
11
-
12
- DATA = Path(os.environ["DATA_DIR"])
13
-
14
- features = ["content", "meta_title", "meta_description"]
15
 
16
 
17
  @cache
18
  def get_model():
19
- file_name = hf_hub_download("opale-ai/news-classifier", "model/model.pickle")
 
 
20
  with open(file_name, "rb") as f:
21
  return pickle.load(f)
22
 
23
 
24
- def record_get():
25
- df = pl.read_csv(DATA / "eval.csv")
26
- return {col: val for col, val in zip(df.columns, df.sample().row(0))}
27
-
28
-
29
- def record_embed(rec):
30
- embeddings = []
31
- for f in features:
32
- embeddings.append(embed([rec[f]]))
33
- return np.hstack(embeddings)
34
 
35
 
36
  def main():
37
  model = get_model()
38
- record = record_get()
39
- embeds = record_embed(record)
40
  (pred,) = model.predict(embeds)
41
- print(record["content"])
42
- print(f"is news (real): {record['is_news_article']}")
43
  print(f"is news (pred): {pred}")
44
 
45
 
 
 
1
  import pickle
2
  from functools import cache
 
3
 
 
4
  import polars as pl
5
  from huggingface_hub import hf_hub_download
6
 
7
+ from utils.embed import embed
8
+ from utils.paths import DATA
 
 
 
9
 
10
 
11
  @cache
12
  def get_model():
13
+ file_name = hf_hub_download(
14
+ "opale-ai/news-classifier", "model/model.pickle", revision="main"
15
+ )
16
  with open(file_name, "rb") as f:
17
  return pickle.load(f)
18
 
19
 
20
+ def get_record():
21
+ df = pl.read_parquet(DATA / "eval.parquet")
22
+ return {
23
+ col: val
24
+ for col, val in zip(df.columns, df.sample().row(0))
25
+ if col in ["text", "is_news"]
26
+ }
 
 
 
27
 
28
 
29
  def main():
30
  model = get_model()
31
+ record = get_record()
32
+ embeds = embed([record["text"]])
33
  (pred,) = model.predict(embeds)
34
+ print(f"is news (real): {record['is_news']}")
 
35
  print(f"is news (pred): {pred}")
36
 
37
 
utils/__init__.py ADDED
File without changes
utils/data.py ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ COLUMNS = [
2
+ "url",
3
+ "website",
4
+ "og_type",
5
+ "meta_description",
6
+ "meta_title",
7
+ "content",
8
+ "date",
9
+ "days_old",
10
+ "link_count",
11
+ "paragraph_count",
12
+ "average_links",
13
+ "text_to_html_ratio",
14
+ "css_title",
15
+ "is_news_article",
16
+ "reason",
17
+ ]
embed.py → utils/embed.py RENAMED
File without changes
utils/paths.py ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from pathlib import Path
3
+
4
+ DATA = Path(os.environ["DATA_DIR"])
5
+ DATA.mkdir(parents=True, exist_ok=True)
6
+ MODEL = Path(os.environ["MODEL_DIR"])
7
+ MODEL.mkdir(parents=True, exist_ok=True)
8
+ IMGS = Path(os.environ["IMGS_DIR"])
9
+ IMGS.mkdir(parents=True, exist_ok=True)