lovemefan committed
Commit ec97ce5 · 1 Parent(s): b366fad

Upload 14 files

upload project from https://github.com/lovemefan/CT-Transformer-punctuation

LICENSE ADDED
@@ -0,0 +1,22 @@
+ The MIT License (MIT)
+
+ Copyright (c) 2014-2017 Alexey Popravka
+ Copyright (c) 2021 Sean Stewart
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
MANIFEST.in ADDED
@@ -0,0 +1,3 @@
+ include cttpunctuator/src/onnx/configuration.json
+ include cttpunctuator/src/onnx/punc.onnx
+ include cttpunctuator/src/onnx/punc.yaml
README.md CHANGED
@@ -1,3 +1,122 @@
- ---
- license: mit
- ---
+
+ <br/>
+ <h2 align="center">Ctt punctuator</h2>
+ <br/>
+
+ ![python3.7](https://img.shields.io/badge/python-3.7-green.svg)
+ ![python3.8](https://img.shields.io/badge/python-3.8-green.svg)
+ ![python3.9](https://img.shields.io/badge/python-3.9-green.svg)
+ ![python3.10](https://img.shields.io/badge/python-3.10-green.svg)
+
+ An enterprise-grade Chinese-English code-switch punctuator, based on [FunASR](https://github.com/alibaba-damo-academy/FunASR/).
+
+ <br/>
+ <h2 align="center">Key Features</h2>
+ <br/>
+
+ - **General**
+
+   The ctt punctuator was trained on Chinese-English code-switch corpora.
+   - [x] offline punctuator
+   - [x] online punctuator
+   - [x] punctuator for Chinese-English code switch
+
+   The ONNX model file is 279 MB; you can download it from [here](https://github.com/lovemefan/CT-Transformer-punctuation/raw/main/cttpunctuator/src/onnx/punc.onnx).
+
+ - **Highly Portable**
+
+   ctt-punctuator benefits from the rich ecosystem built around **ONNX**, running everywhere these runtimes are available.
+
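+   For example, the exported graph can be inspected with nothing but `onnxruntime` (a minimal sketch; the input and output names depend on the export, so they are queried rather than assumed):
+
+   ```python
+   import onnxruntime as ort
+
+   # Load the LFS-downloaded model on CPU and list its I/O signature.
+   sess = ort.InferenceSession(
+       "cttpunctuator/src/onnx/punc.onnx", providers=["CPUExecutionProvider"]
+   )
+   print([i.name for i in sess.get_inputs()])
+   print([o.name for o in sess.get_outputs()])
+   ```
+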
+ ## Installation
+
+ ```bash
+ sudo apt install git-lfs
+ # if the code raises "failed: Protobuf parsing failed.",
+ # install git-lfs and run `git lfs install`
+ git lfs install
+ # use git-lfs to download the ONNX file
+ git clone https://github.com/lovemefan/CT-Transformer-punctuation.git
+ cd CT-Transformer-punctuation
+ # sanity check: punc.onnx should be ~279 MB on disk; a file of a few hundred
+ # bytes is only the LFS pointer (fix it with `git lfs pull`)
+ ls -lh cttpunctuator/src/onnx/punc.onnx
+ pip install -e .
+ ```
+
+ ## Usage
+
+ ```python
+ import logging
+
+ from cttPunctuator import CttPunctuator
+
+ logging.basicConfig(
+     level=logging.INFO,
+     format="[%(asctime)s %(levelname)s] [%(filename)s:%(lineno)d %(module)s.%(funcName)s] %(message)s",
+ )
+
+ # offline mode
+ punc = CttPunctuator()
+ text = "据报道纽约时报使用ChatGPT创建了一个情人节消息生成器用户只需输入几个提示就可以得到一封自动生成的情书"
+ logging.info(punc.punctuate(text)[0])
+
+ # online mode
+ punc = CttPunctuator(online=True)
+ text_in = "跨境河流是养育沿岸|人民的生命之源长期以来为帮助下游地区防灾减灾中方技术人员|在上游地区极为恶劣的自然条件下克服巨大困难甚至冒着生命危险|向印方提供汛期水文资料处理紧急事件中方重视印方在跨境河流>问题上的关切|愿意进一步完善双方联合工作机制|凡是|中方能做的我们|都会去做而且会做得更好我请印度朋友们放心中国在上游的|任何开发利用都会经过科学|规划和论证兼顾上下游的利益"
+
+ vads = text_in.split("|")
+ rec_result_all = ""
+ param_dict = {"cache": []}
+ for vad in vads:
+     result = punc.punctuate(vad, param_dict=param_dict)
+     rec_result_all += result[0]
+     logging.info(f"Part: {rec_result_all}")
+
+ logging.info(f"Final: {rec_result_all}")
+ ```
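+
+ `punctuate()` passes through the underlying model's return value: offline mode yields a `(punctuated_text, punctuation_ids)` pair, while online mode additionally carries the rolling cache, so `result[0]` is always the text (a sketch of the shapes, per `cttpunctuator/src/punctuator.py`):
+
+ ```python
+ sentence, punc_ids = punc.punctuate(text)        # offline mode
+ sentence, punc_ids, cache = punc.punctuate(vad)  # online mode
+ ```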
+ ## Result
+
+ ```bash
+ [2023-04-19 01:12:39,308 INFO] [ctt-punctuator.py:50 ctt-punctuator.__init__] Initializing punctuator model with offline mode.
+ [2023-04-19 01:12:55,854 INFO] [ctt-punctuator.py:52 ctt-punctuator.__init__] Offline model initialized.
+ [2023-04-19 01:12:55,854 INFO] [ctt-punctuator.py:55 ctt-punctuator.__init__] Model initialized.
+ [2023-04-19 01:12:55,868 INFO] [ctt-punctuator.py:67 ctt-punctuator.<module>] 据报道,纽约时报使用ChatGPT创建了一个情人节消息生成器,用户只需输入几个提示,就可以得到一封自动生成的情书。
+ [2023-04-19 01:12:55,868 INFO] [ctt-punctuator.py:40 ctt-punctuator.__init__] Initializing punctuator model with online mode.
+ [2023-04-19 01:13:12,499 INFO] [ctt-punctuator.py:43 ctt-punctuator.__init__] Online model initialized.
+ [2023-04-19 01:13:12,499 INFO] [ctt-punctuator.py:55 ctt-punctuator.__init__] Model initialized.
+ [2023-04-19 01:13:12,502 INFO] [ctt-punctuator.py:77 ctt-punctuator.<module>] Partial: 跨境河流是养育沿岸
+ [2023-04-19 01:13:12,508 INFO] [ctt-punctuator.py:77 ctt-punctuator.<module>] Partial: 跨境河流是养育沿岸人民的生命之源。长期以来,为帮助下游地区防灾减灾中方技术人员
+ [2023-04-19 01:13:12,521 INFO] [ctt-punctuator.py:77 ctt-punctuator.<module>] Partial: 跨境河流是养育沿岸人民的生命之源。长期以来,为帮助下游地区防灾减灾中方技术人员在上游地区极为恶劣的自然条件下克服巨大困难,甚至冒着生命危险
+ [2023-04-19 01:13:12,547 INFO] [ctt-punctuator.py:77 ctt-punctuator.<module>] Partial: 跨境河流是养育沿岸人民的生命之源。长期以来,为帮助下游地区防灾减灾中方技术人员在上游地区极为恶劣的自然条件下克服巨大困难,甚至冒着生命危险,向印方提供汛期水文资料处理紧急事件。中方重视印方在跨境河流>问题上的关切
+ [2023-04-19 01:13:12,553 INFO] [ctt-punctuator.py:77 ctt-punctuator.<module>] Partial: 跨境河流是养育沿岸人民的生命之源。长期以来,为帮助下游地区防灾减灾中方技术人员在上游地区极为恶劣的自然条件下克服巨大困难,甚至冒着生命危险,向印方提供汛期水文资料处理紧急事件。中方重视印方在跨境河流>问题上的关切,愿意进一步完善双方联合工作机制
+ [2023-04-19 01:13:12,559 INFO] [ctt-punctuator.py:77 ctt-punctuator.<module>] Partial: 跨境河流是养育沿岸人民的生命之源。长期以来,为帮助下游地区防灾减灾中方技术人员在上游地区极为恶劣的自然条件下克服巨大困难,甚至冒着生命危险,向印方提供汛期水文资料处理紧急事件。中方重视印方在跨境河流>问题上的关切,愿意进一步完善双方联合工作机制。凡是
+ [2023-04-19 01:13:12,560 INFO] [ctt-punctuator.py:77 ctt-punctuator.<module>] Partial: 跨境河流是养育沿岸人民的生命之源。长期以来,为帮助下游地区防灾减灾中方技术人员在上游地区极为恶劣的自然条件下克服巨大困难,甚至冒着生命危险,向印方提供汛期水文资料处理紧急事件。中方重视印方在跨境河流>问题上的关切,愿意进一步完善双方联合工作机制。凡是中方能做的,我们
+ [2023-04-19 01:13:12,567 INFO] [ctt-punctuator.py:77 ctt-punctuator.<module>] Partial: 跨境河流是养育沿岸人民的生命之源。长期以来,为帮助下游地区防灾减灾中方技术人员在上游地区极为恶劣的自然条件下克服巨大困难,甚至冒着生命危险,向印方提供汛期水文资料处理紧急事件。中方重视印方在跨境河流>问题上的关切,愿意进一步完善双方联合工作机制。凡是中方能做的,我们都会去做,而且会做得更好。我请印度朋友们放心,中国在上游的
+ [2023-04-19 01:13:12,572 INFO] [ctt-punctuator.py:77 ctt-punctuator.<module>] Partial: 跨境河流是养育沿岸人民的生命之源。长期以来,为帮助下游地区防灾减灾中方技术人员在上游地区极为恶劣的自然条件下克服巨大困难,甚至冒着生命危险,向印方提供汛期水文资料处理紧急事件。中方重视印方在跨境河流>问题上的关切,愿意进一步完善双方联合工作机制。凡是中方能做的,我们都会去做,而且会做得更好。我请印度朋友们放心,中国在上游的任何开发利用,都会经过科学
+ [2023-04-19 01:13:12,578 INFO] [ctt-punctuator.py:77 ctt-punctuator.<module>] Partial: 跨境河流是养育沿岸人民的生命之源。长期以来,为帮助下游地区防灾减灾中方技术人员在上游地区极为恶劣的自然条件下克服巨大困难,甚至冒着生命危险,向印方提供汛期水文资料处理紧急事件。中方重视印方在跨境河流>问题上的关切,愿意进一步完善双方联合工作机制。凡是中方能做的,我们都会去做,而且会做得更好。我请印度朋友们放心,中国在上游的任何开发利用,都会经过科学规划和论证,兼顾上下游的利益
+ [2023-04-19 01:13:12,578 INFO] [ctt-punctuator.py:79 ctt-punctuator.<module>] Final: 跨境河流是养育沿岸人民的生命之源。长期以来,为帮助下游地区防灾减灾中方技术人员在上游地区极为恶劣的自然条件下克服巨大困难,甚至冒着生命危险,向印方提供汛期水文资料处理紧急事件。中方重视印方在跨境河流>问题上的关切,愿意进一步完善双方联合工作机制。凡是中方能做的,我们都会去做,而且会做得更好。我请印度朋友们放心,中国在上游的任何开发利用,都会经过科学规划和论证,兼顾上下游的利益
+ ```
+
+ ## Citation
+
+ ```
+ @inproceedings{chen2020controllable,
+   title={Controllable Time-Delay Transformer for Real-Time Punctuation Prediction and Disfluency Detection},
+   author={Chen, Qian and Chen, Mengzhe and Li, Bo and Wang, Wen},
+   booktitle={ICASSP 2020-2020 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)},
+   pages={8069--8073},
+   year={2020},
+   organization={IEEE}
+ }
+ ```
+
+ ```
+ @misc{FunASR,
+   author = {Speech Lab, Alibaba Group, China},
+   title = {FunASR: A Fundamental End-to-End Speech Recognition Toolkit},
+   year = {2023},
+   publisher = {GitHub},
+   journal = {GitHub repository},
+   howpublished = {\url{https://github.com/alibaba-damo-academy/FunASR/}},
+ }
+ ```
cttPunctuator.py ADDED
@@ -0,0 +1,64 @@
+ # -*- coding:utf-8 -*-
+ # @FileName  :ctt-punctuator.py
+ # @Time      :2023/4/13 15:03
+ # @Author    :lovemefan
+ # @Email     :[email protected]
+
+
+ __author__ = "lovemefan"
+ __copyright__ = "Copyright (C) 2023 lovemefan"
+ __license__ = "MIT"
+ __version__ = "v0.0.1"
+
+ import logging
+ import threading
+
+ from cttpunctuator.src.punctuator import (CT_Transformer,
+                                           CT_Transformer_VadRealtime)
+
+ logging.basicConfig(
+     level=logging.INFO,
+     format="[%(asctime)s %(levelname)s] [%(filename)s:%(lineno)d %(module)s.%(funcName)s] %(message)s",
+ )
+
+ lock = threading.RLock()
+
+
+ class CttPunctuator:
+     # Class-level singletons: the heavyweight ONNX sessions are created once
+     # and shared by every instance of the same mode.
+     _offline_model = None
+     _online_model = None
+
+     def __init__(self, online: bool = False):
+         """Punctuator with a singleton pattern (double-checked locking).
+
+         :param online: if True, use the streaming (VAD realtime) model,
+             otherwise the offline model.
+         """
+         self.online = online
+
+         if online:
+             if CttPunctuator._online_model is None:
+                 with lock:
+                     if CttPunctuator._online_model is None:
+                         logging.info("Initializing punctuator model with online mode.")
+                         CttPunctuator._online_model = CT_Transformer_VadRealtime()
+                         logging.info("Online model initialized.")
+             # Each instance keeps its own rolling cache.
+             self.param_dict = {"cache": []}
+             self.model = CttPunctuator._online_model
+
+         else:
+             if CttPunctuator._offline_model is None:
+                 with lock:
+                     if CttPunctuator._offline_model is None:
+                         logging.info("Initializing punctuator model with offline mode.")
+                         CttPunctuator._offline_model = CT_Transformer()
+                         logging.info("Offline model initialized.")
+             self.model = CttPunctuator._offline_model
+
+         logging.info("Model initialized.")
+
+     def punctuate(self, text: str, param_dict=None):
+         if self.online:
+             # Fall back to this instance's cache when the caller does not
+             # manage its own param_dict.
+             param_dict = param_dict or self.param_dict
+             return self.model(text, param_dict)
+         else:
+             return self.model(text)
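
Because the loaded models live on the class, every `CttPunctuator` instance of a given mode shares one ONNX session; a quick check (a sketch, assuming the package is installed as described in the README):

```python
from cttPunctuator import CttPunctuator

a = CttPunctuator()
b = CttPunctuator()
assert a.model is b.model  # both offline instances share one CT_Transformer
```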
cttpunctuator/__init__.py ADDED
@@ -0,0 +1,5 @@
+ # -*- coding:utf-8 -*-
+ # @FileName  :__init__.py
+ # @Time      :2023/4/13 14:58
+ # @Author    :lovemefan
+ # @Email     :[email protected]
cttpunctuator/src/onnx/configuration.json ADDED
@@ -0,0 +1,20 @@
+ {
+     "framework": "onnx",
+     "task": "punctuation",
+     "model": {
+         "type": "generic-punc",
+         "punc_model_name": "punc.pb",
+         "punc_model_config": {
+             "type": "pytorch",
+             "code_base": "funasr",
+             "mode": "punc",
+             "lang": "zh-cn",
+             "batch_size": 1,
+             "punc_config": "punc.yaml",
+             "model": "damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch"
+         }
+     },
+     "pipeline": {
+         "type": "punc-inference"
+     }
+ }
cttpunctuator/src/onnx/punc.onnx ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:06ae02f3fce2d6bfbcdd988672467808c0113e77b6eed7dc52835ff627e12330
+ size 292007354
cttpunctuator/src/onnx/punc.yaml ADDED
The diff for this file is too large to render. See raw diff
 
cttpunctuator/src/punctuator.py ADDED
@@ -0,0 +1,307 @@
+ import logging
+ import os.path
+ from pathlib import Path
+ from typing import Tuple, Union
+
+ import numpy as np
+
+ from cttpunctuator.src.utils.OrtInferSession import (ONNXRuntimeError,
+                                                      OrtInferSession)
+ from cttpunctuator.src.utils.text_post_process import (TokenIDConverter,
+                                                        code_mix_split_words,
+                                                        read_yaml,
+                                                        split_to_mini_sentence)
+
+
+ class CT_Transformer:
+     """
+     Author: Speech Lab, Alibaba Group, China
+     CT-Transformer: Controllable time-delay transformer
+     for real-time punctuation prediction and disfluency detection
+     https://arxiv.org/pdf/2003.01309.pdf
+     """
+
+     def __init__(
+         self,
+         model_dir: Union[str, Path] = None,
+         batch_size: int = 1,
+         device_id: Union[str, int] = "-1",
+         quantize: bool = False,
+         intra_op_num_threads: int = 4,
+     ):
+         model_dir = model_dir or os.path.join(os.path.dirname(__file__), "onnx")
+         if not Path(model_dir).exists():
+             raise FileNotFoundError(f"{model_dir} does not exist.")
+
+         model_file = os.path.join(model_dir, "punc.onnx")
+         if quantize:
+             model_file = os.path.join(model_dir, "model_quant.onnx")
+         config_file = os.path.join(model_dir, "punc.yaml")
+         config = read_yaml(config_file)
+
+         self.converter = TokenIDConverter(config["token_list"])
+         self.ort_infer = OrtInferSession(
+             model_file, device_id, intra_op_num_threads=intra_op_num_threads
+         )
+         self.batch_size = 1  # fixed to 1 regardless of the constructor argument
+         self.punc_list = config["punc_list"]
+         self.period = 0
+         # Normalize half-width punctuation to full-width and remember the
+         # index of the period token.
+         for i in range(len(self.punc_list)):
+             if self.punc_list[i] == ",":
+                 self.punc_list[i] = ","
+             elif self.punc_list[i] == "?":
+                 self.punc_list[i] = "?"
+             elif self.punc_list[i] == "。":
+                 self.period = i
+
+     def __call__(self, text: Union[list, str], split_size=20):
+         split_text = code_mix_split_words(text)
+         split_text_id = self.converter.tokens2ids(split_text)
+         mini_sentences = split_to_mini_sentence(split_text, split_size)
+         mini_sentences_id = split_to_mini_sentence(split_text_id, split_size)
+         assert len(mini_sentences) == len(mini_sentences_id)
+         cache_sent = []
+         cache_sent_id = []
+         new_mini_sentence = ""
+         new_mini_sentence_punc = []
+         cache_pop_trigger_limit = 200
+         for mini_sentence_i in range(len(mini_sentences)):
+             mini_sentence = mini_sentences[mini_sentence_i]
+             mini_sentence_id = mini_sentences_id[mini_sentence_i]
+             mini_sentence = cache_sent + mini_sentence
+             mini_sentence_id = np.array(cache_sent_id + mini_sentence_id, dtype="int64")
+             data = {
+                 "text": mini_sentence_id[None, :],
+                 "text_lengths": np.array([len(mini_sentence_id)], dtype="int32"),
+             }
+             try:
+                 outputs = self.infer(data["text"], data["text_lengths"])
+                 y = outputs[0]
+                 punctuations = np.argmax(y, axis=-1)[0]
+                 assert punctuations.size == len(mini_sentence)
+             except ONNXRuntimeError:
+                 # The code below relies on `punctuations`, so re-raise after
+                 # logging instead of silently continuing.
+                 logging.warning("ONNX inference failed for mini-sentence %d", mini_sentence_i)
+                 raise
+
+             # Search for the last period/question mark; everything after it
+             # is kept as cache for the next mini-sentence.
+             if mini_sentence_i < len(mini_sentences) - 1:
+                 sentenceEnd = -1
+                 last_comma_index = -1
+                 for i in range(len(punctuations) - 2, 1, -1):
+                     if (
+                         self.punc_list[punctuations[i]] == "。"
+                         or self.punc_list[punctuations[i]] == "?"
+                     ):
+                         sentenceEnd = i
+                         break
+                     if last_comma_index < 0 and self.punc_list[punctuations[i]] == ",":
+                         last_comma_index = i
+
+                 if (
+                     sentenceEnd < 0
+                     and len(mini_sentence) > cache_pop_trigger_limit
+                     and last_comma_index >= 0
+                 ):
+                     # The sentence is too long; cut it off at a comma.
+                     sentenceEnd = last_comma_index
+                     punctuations[sentenceEnd] = self.period
+                 cache_sent = mini_sentence[sentenceEnd + 1 :]
+                 cache_sent_id = mini_sentence_id[sentenceEnd + 1 :].tolist()
+                 mini_sentence = mini_sentence[0 : sentenceEnd + 1]
+                 punctuations = punctuations[0 : sentenceEnd + 1]
+
+             new_mini_sentence_punc += [int(x) for x in punctuations]
+             words_with_punc = []
+             for i in range(len(mini_sentence)):
+                 if i > 0:
+                     # Insert a space between two adjacent ASCII (English) words.
+                     if (
+                         len(mini_sentence[i][0].encode()) == 1
+                         and len(mini_sentence[i - 1][0].encode()) == 1
+                     ):
+                         mini_sentence[i] = " " + mini_sentence[i]
+                 words_with_punc.append(mini_sentence[i])
+                 if self.punc_list[punctuations[i]] != "_":
+                     words_with_punc.append(self.punc_list[punctuations[i]])
+             new_mini_sentence += "".join(words_with_punc)
+             # Add a period at the end of the final sentence.
+             new_mini_sentence_out = new_mini_sentence
+             new_mini_sentence_punc_out = new_mini_sentence_punc
+             if mini_sentence_i == len(mini_sentences) - 1:
+                 if new_mini_sentence[-1] == "," or new_mini_sentence[-1] == "、":
+                     new_mini_sentence_out = new_mini_sentence[:-1] + "。"
+                     new_mini_sentence_punc_out = new_mini_sentence_punc[:-1] + [
+                         self.period
+                     ]
+                 elif new_mini_sentence[-1] != "。" and new_mini_sentence[-1] != "?":
+                     new_mini_sentence_out = new_mini_sentence + "。"
+                     new_mini_sentence_punc_out = new_mini_sentence_punc[:-1] + [
+                         self.period
+                     ]
+         return new_mini_sentence_out, new_mini_sentence_punc_out
+
+     def infer(
+         self, feats: np.ndarray, feats_len: np.ndarray
+     ) -> Tuple[np.ndarray, np.ndarray]:
+         outputs = self.ort_infer([feats, feats_len])
+         return outputs
+
+
+ class CT_Transformer_VadRealtime(CT_Transformer):
+     """
+     Author: Speech Lab, Alibaba Group, China
+     CT-Transformer: Controllable time-delay transformer for
+     real-time punctuation prediction and disfluency detection
+     https://arxiv.org/pdf/2003.01309.pdf
+     """
+
+     def __init__(
+         self,
+         model_dir: Union[str, Path] = None,
+         batch_size: int = 1,
+         device_id: Union[str, int] = "-1",
+         quantize: bool = False,
+         intra_op_num_threads: int = 4,
+     ):
+         super(CT_Transformer_VadRealtime, self).__init__(
+             model_dir, batch_size, device_id, quantize, intra_op_num_threads
+         )
+
+     def __call__(self, text: str, param_dict: dict, split_size=20):
+         cache_key = "cache"
+         assert cache_key in param_dict
+         cache = param_dict[cache_key]
+         if cache is not None and len(cache) > 0:
+             precache = "".join(cache)
+         else:
+             precache = ""
+             cache = []
+         full_text = precache + text
+         split_text = code_mix_split_words(full_text)
+         split_text_id = self.converter.tokens2ids(split_text)
+         mini_sentences = split_to_mini_sentence(split_text, split_size)
+         mini_sentences_id = split_to_mini_sentence(split_text_id, split_size)
+         new_mini_sentence_punc = []
+         assert len(mini_sentences) == len(mini_sentences_id)
+
+         cache_sent = []
+         cache_sent_id = np.array([], dtype="int32")
+         sentence_punc_list = []
+         sentence_words_list = []
+         cache_pop_trigger_limit = 200
+         skip_num = 0
+         for mini_sentence_i in range(len(mini_sentences)):
+             mini_sentence = mini_sentences[mini_sentence_i]
+             mini_sentence_id = mini_sentences_id[mini_sentence_i]
+             mini_sentence = cache_sent + mini_sentence
+             mini_sentence_id = np.concatenate((cache_sent_id, mini_sentence_id), axis=0)
+             text_length = len(mini_sentence_id)
+             data = {
+                 "input": mini_sentence_id[None, :],
+                 "text_lengths": np.array([text_length], dtype="int32"),
+                 "vad_mask": self.vad_mask(text_length, len(cache))[
+                     None, None, :, :
+                 ].astype(np.float32),
+                 "sub_masks": np.tril(
+                     np.ones((text_length, text_length), dtype=np.float32)
+                 )[None, None, :, :].astype(np.float32),
+             }
+             try:
+                 outputs = self.infer(
+                     data["input"],
+                     data["text_lengths"],
+                     data["vad_mask"],
+                     data["sub_masks"],
+                 )
+                 y = outputs[0]
+                 punctuations = np.argmax(y, axis=-1)[0]
+                 assert punctuations.size == len(mini_sentence)
+             except ONNXRuntimeError:
+                 # As in the offline model, re-raise after logging so the
+                 # undefined `punctuations` cannot be used below.
+                 logging.warning("ONNX inference failed for mini-sentence %d", mini_sentence_i)
+                 raise
+
+             # Search for the last period/question mark as cache.
+             if mini_sentence_i < len(mini_sentences) - 1:
+                 sentenceEnd = -1
+                 last_comma_index = -1
+                 for i in range(len(punctuations) - 2, 1, -1):
+                     if (
+                         self.punc_list[punctuations[i]] == "。"
+                         or self.punc_list[punctuations[i]] == "?"
+                     ):
+                         sentenceEnd = i
+                         break
+                     if last_comma_index < 0 and self.punc_list[punctuations[i]] == ",":
+                         last_comma_index = i
+
+                 if (
+                     sentenceEnd < 0
+                     and len(mini_sentence) > cache_pop_trigger_limit
+                     and last_comma_index >= 0
+                 ):
+                     # The sentence is too long; cut it off at a comma.
+                     sentenceEnd = last_comma_index
+                     punctuations[sentenceEnd] = self.period
+                 cache_sent = mini_sentence[sentenceEnd + 1 :]
+                 cache_sent_id = mini_sentence_id[sentenceEnd + 1 :]
+                 mini_sentence = mini_sentence[0 : sentenceEnd + 1]
+                 punctuations = punctuations[0 : sentenceEnd + 1]
+
+             punctuations_np = [int(x) for x in punctuations]
+             new_mini_sentence_punc += punctuations_np
+             sentence_punc_list += [self.punc_list[int(x)] for x in punctuations_np]
+             sentence_words_list += mini_sentence
+
+         assert len(sentence_punc_list) == len(sentence_words_list)
+         words_with_punc = []
+         sentence_punc_list_out = []
+         for i in range(0, len(sentence_words_list)):
+             if i > 0:
+                 # Insert a space between two adjacent ASCII (English) words.
+                 if (
+                     len(sentence_words_list[i][0].encode()) == 1
+                     and len(sentence_words_list[i - 1][-1].encode()) == 1
+                 ):
+                     sentence_words_list[i] = " " + sentence_words_list[i]
+             if skip_num < len(cache):
+                 # Words carried over from the previous call's cache were
+                 # already emitted, so skip them in the output.
+                 skip_num += 1
+             else:
+                 words_with_punc.append(sentence_words_list[i])
+             if skip_num >= len(cache):
+                 sentence_punc_list_out.append(sentence_punc_list[i])
+                 if sentence_punc_list[i] != "_":
+                     words_with_punc.append(sentence_punc_list[i])
+         sentence_out = "".join(words_with_punc)
+
+         # Everything after the last sentence-final mark becomes the new cache.
+         sentenceEnd = -1
+         for i in range(len(sentence_punc_list) - 2, 1, -1):
+             if sentence_punc_list[i] == "。" or sentence_punc_list[i] == "?":
+                 sentenceEnd = i
+                 break
+         cache_out = sentence_words_list[sentenceEnd + 1 :]
+         if sentence_out[-1] in self.punc_list:
+             sentence_out = sentence_out[:-1]
+             sentence_punc_list_out[-1] = "_"
+         param_dict[cache_key] = cache_out
+         return sentence_out, sentence_punc_list_out, cache_out
+
+     def vad_mask(self, size, vad_pos, dtype=np.bool_):
+         """Create a mask for decoder self-attention.
+
+         :param int size: size of the (square) mask
+         :param int vad_pos: position of the VAD boundary, i.e. the length
+             of the cached prefix from the previous call
+         :param np.dtype dtype: result dtype
+         :rtype: np.ndarray of shape (size, size)
+         """
+         ret = np.ones((size, size), dtype=dtype)
+         if vad_pos <= 0 or vad_pos >= size:
+             return ret
+         sub_corner = np.zeros((vad_pos - 1, size - vad_pos), dtype=dtype)
+         ret[0 : vad_pos - 1, vad_pos:] = sub_corner
+         return ret
+
+     def infer(
+         self,
+         feats: np.ndarray,
+         feats_len: np.ndarray,
+         vad_mask: np.ndarray,
+         sub_masks: np.ndarray,
+     ) -> Tuple[np.ndarray, np.ndarray]:
+         outputs = self.ort_infer([feats, feats_len, vad_mask, sub_masks])
+         return outputs
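
For intuition about `vad_mask`: tokens in the cached prefix (everything before the VAD boundary) are blocked from attending to the newly appended text, while all other positions attend freely. A standalone sketch reproducing the same computation:

```python
import numpy as np

size, vad_pos = 5, 3  # 5 tokens total, cached prefix of length 3
mask = np.ones((size, size), dtype=np.bool_)
mask[0 : vad_pos - 1, vad_pos:] = False  # prefix rows cannot see the new segment
print(mask.astype(int))
# [[1 1 1 0 0]
#  [1 1 1 0 0]
#  [1 1 1 1 1]
#  [1 1 1 1 1]
#  [1 1 1 1 1]]
```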
cttpunctuator/src/utils/OrtInferSession.py ADDED
@@ -0,0 +1,98 @@
+ # -*- coding:utf-8 -*-
+ # @FileName  :OrtInferSession.py
+ # @Time      :2023/4/13 15:13
+ # @Author    :lovemefan
+ # @Email     :[email protected]
+ import warnings
+ from pathlib import Path
+ from typing import List
+
+ import numpy as np
+ from onnxruntime import (GraphOptimizationLevel, InferenceSession,
+                          SessionOptions, get_available_providers, get_device)
+
+
+ class ONNXRuntimeError(Exception):
+     pass
+
+
+ class OrtInferSession:
+     def __init__(self, model_file, device_id=-1, intra_op_num_threads=4):
+         device_id = str(device_id)
+         sess_opt = SessionOptions()
+         sess_opt.intra_op_num_threads = intra_op_num_threads
+         sess_opt.log_severity_level = 4
+         sess_opt.enable_cpu_mem_arena = False
+         sess_opt.graph_optimization_level = GraphOptimizationLevel.ORT_ENABLE_ALL
+
+         cuda_ep = "CUDAExecutionProvider"
+         cuda_provider_options = {
+             "device_id": device_id,
+             "arena_extend_strategy": "kNextPowerOfTwo",
+             "cudnn_conv_algo_search": "EXHAUSTIVE",
+             "do_copy_in_default_stream": "true",
+         }
+         cpu_ep = "CPUExecutionProvider"
+         cpu_provider_options = {
+             "arena_extend_strategy": "kSameAsRequested",
+         }
+
+         # Prefer CUDA when a device id is given and the GPU build is present;
+         # always keep the CPU provider as a fallback.
+         EP_list = []
+         if (
+             device_id != "-1"
+             and get_device() == "GPU"
+             and cuda_ep in get_available_providers()
+         ):
+             EP_list = [(cuda_ep, cuda_provider_options)]
+         EP_list.append((cpu_ep, cpu_provider_options))
+
+         self._verify_model(model_file)
+         self.session = InferenceSession(
+             model_file, sess_options=sess_opt, providers=EP_list
+         )
+
+         if device_id != "-1" and cuda_ep not in self.session.get_providers():
+             warnings.warn(
+                 f"{cuda_ep} is not available in the current env, "
+                 f"so inference automatically falls back to {cpu_ep}.\n"
+                 "Please ensure the installed onnxruntime-gpu version matches "
+                 "your CUDA and cuDNN versions; their relations are listed on "
+                 "the official web site: "
+                 "https://onnxruntime.ai/docs/execution-providers/CUDA-ExecutionProvider.html",
+                 RuntimeWarning,
+             )
+
+     def __call__(self, input_content: List[np.ndarray]) -> np.ndarray:
+         input_dict = dict(zip(self.get_input_names(), input_content))
+         try:
+             return self.session.run(self.get_output_names(), input_dict)
+         except Exception as e:
+             raise ONNXRuntimeError("ONNXRuntime inference failed.") from e
+
+     def get_input_names(
+         self,
+     ):
+         return [v.name for v in self.session.get_inputs()]
+
+     def get_output_names(
+         self,
+     ):
+         return [v.name for v in self.session.get_outputs()]
+
+     def get_character_list(self, key: str = "character"):
+         return self.meta_dict[key].splitlines()
+
+     def have_key(self, key: str = "character") -> bool:
+         self.meta_dict = self.session.get_modelmeta().custom_metadata_map
+         if key in self.meta_dict.keys():
+             return True
+         return False
+
+     @staticmethod
+     def _verify_model(model_path):
+         model_path = Path(model_path)
+         if not model_path.exists():
+             raise FileNotFoundError(f"{model_path} does not exist.")
+         if not model_path.is_file():
+             raise FileExistsError(f"{model_path} is not a file.")
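
`OrtInferSession.__call__` zips the arrays you pass with the session's input names, so callers only supply them in graph order. A raw invocation of the bundled offline model (a sketch: the two-input signature and dtypes mirror how `CT_Transformer` calls it, and the token ids here are toy values):

```python
import numpy as np

from cttpunctuator.src.utils.OrtInferSession import OrtInferSession

sess = OrtInferSession("cttpunctuator/src/onnx/punc.onnx", device_id=-1)
ids = np.array([[1, 2, 3]], dtype="int64")  # toy token ids, batch of 1
lengths = np.array([3], dtype="int32")
logits = sess([ids, lengths])[0]  # per-token scores over the punctuation list
print(logits.shape)
```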
cttpunctuator/src/utils/text_post_process.py ADDED
@@ -0,0 +1,86 @@
+ # -*- coding:utf-8 -*-
+ # @FileName  :text_post_process.py
+ # @Time      :2023/4/13 15:09
+ # @Author    :lovemefan
+ # @Email     :[email protected]
+ from pathlib import Path
+ from typing import Dict, Iterable, List, Union
+
+ import numpy as np
+ import yaml
+ from typeguard import check_argument_types
+
+
+ class TokenIDConverterError(Exception):
+     pass
+
+
+ class TokenIDConverter:
+     def __init__(
+         self,
+         token_list: Union[List, str],
+     ):
+         check_argument_types()
+
+         self.token_list = token_list
+         self.unk_symbol = token_list[-1]
+         self.token2id = {v: i for i, v in enumerate(self.token_list)}
+         self.unk_id = self.token2id[self.unk_symbol]
+
+     def get_num_vocabulary_size(self) -> int:
+         return len(self.token_list)
+
+     def ids2tokens(self, integers: Union[np.ndarray, Iterable[int]]) -> List[str]:
+         if isinstance(integers, np.ndarray) and integers.ndim != 1:
+             raise TokenIDConverterError(
+                 f"Must be 1 dim ndarray, but got {integers.ndim}"
+             )
+         return [self.token_list[i] for i in integers]
+
+     def tokens2ids(self, tokens: Iterable[str]) -> List[int]:
+         # Unknown tokens are mapped to the <unk> id.
+         return [self.token2id.get(i, self.unk_id) for i in tokens]
+
+
+ def split_to_mini_sentence(words: list, word_limit: int = 20):
+     assert word_limit > 1
+     if len(words) <= word_limit:
+         return [words]
+     sentences = []
+     length = len(words)
+     sentence_len = length // word_limit
+     for i in range(sentence_len):
+         sentences.append(words[i * word_limit : (i + 1) * word_limit])
+     if length % word_limit > 0:
+         sentences.append(words[sentence_len * word_limit :])
+     return sentences
+
+
+ def code_mix_split_words(text: str):
+     words = []
+     segs = text.split()
+     for seg in segs:
+         # There is no space in seg.
+         current_word = ""
+         for c in seg:
+             if len(c.encode()) == 1:
+                 # This is an ASCII char.
+                 current_word += c
+             else:
+                 # This is a Chinese char.
+                 if len(current_word) > 0:
+                     words.append(current_word)
+                     current_word = ""
+                 words.append(c)
+         if len(current_word) > 0:
+             words.append(current_word)
+     return words
+
+
+ def read_yaml(yaml_path: Union[str, Path]) -> Dict:
+     if not Path(yaml_path).exists():
+         raise FileNotFoundError(f"The {yaml_path} does not exist.")
+
+     with open(str(yaml_path), "rb") as f:
+         data = yaml.load(f, Loader=yaml.Loader)
+     return data
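
A quick sanity check of the two helpers above (assuming the package is installed as in the README): `code_mix_split_words` keeps ASCII runs together while splitting each Chinese character, and `split_to_mini_sentence` chunks the resulting token list:

```python
from cttpunctuator.src.utils.text_post_process import (code_mix_split_words,
                                                       split_to_mini_sentence)

print(code_mix_split_words("ChatGPT创建了一个generator"))
# ['ChatGPT', '创', '建', '了', '一', '个', 'generator']
print(split_to_mini_sentence(list("abcdefg"), word_limit=3))
# [['a', 'b', 'c'], ['d', 'e', 'f'], ['g']]
```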
setup.py ADDED
@@ -0,0 +1,64 @@
+ # -*- coding:utf-8 -*-
+ # @FileName  :setup.py
+ # @Time      :2023/4/4 11:22
+ # @Author    :lovemefan
+ # @Email     :[email protected]
+ import os
+ from pathlib import Path
+
+ from setuptools import find_namespace_packages, setup
+
+ dirname = Path(os.path.dirname(__file__))
+ version_file = dirname / "version.txt"
+ with open(version_file, "r") as f:
+     version = f.read().strip()
+
+ requirements = {
+     "install": [
+         "setuptools<=65.0",
+         "PyYAML",
+         "typeguard==2.13.3",
+         "onnxruntime==1.14.1",
+     ],
+     "setup": [
+         "numpy==1.24.2",
+     ],
+     "all": [],
+ }
+ requirements["all"].extend(requirements["install"])
+
+ install_requires = requirements["install"]
+ setup_requires = requirements["setup"]
+
+
+ setup(
+     name="cttpunctuator",
+     version=version,
+     url="https://github.com/lovemefan/CT-Transformer-punctuation",
+     author="Lovemefan, Yunnan Key Laboratory of Artificial Intelligence, "
+     "Kunming University of Science and Technology, Kunming, Yunnan",
+     author_email="[email protected]",
+     description="ctt-punctuator: an enterprise-grade punctuator for Chinese ASR, "
+     "based on the CT-Transformer from the open-source FunASR project",
+     long_description=open(os.path.join(dirname, "README.md"), encoding="utf-8").read(),
+     long_description_content_type="text/markdown",
+     license="The MIT License",
+     packages=find_namespace_packages(),
+     include_package_data=True,
+     install_requires=install_requires,
+     python_requires=">=3.7.0",
+     classifiers=[
+         "Programming Language :: Python",
+         "Programming Language :: Python :: 3.7",
+         "Programming Language :: Python :: 3.8",
+         "Programming Language :: Python :: 3.9",
+         "Programming Language :: Python :: 3.10",
+         "Development Status :: 5 - Production/Stable",
+         "Intended Audience :: Science/Research",
+         "Operating System :: POSIX :: Linux",
+         "License :: OSI Approved :: MIT License",
+         "Topic :: Multimedia :: Sound/Audio :: Speech",
+         "Topic :: Scientific/Engineering :: Artificial Intelligence",
+         "Topic :: Software Development :: Libraries :: Python Modules",
+     ],
+ )
test/test.py ADDED
@@ -0,0 +1,38 @@
+ # -*- coding:utf-8 -*-
+ # @FileName  :test.py
+ # @Time      :2023/4/19 13:39
+ # @Author    :lovemefan
+ # @Email     :[email protected]
+
+ import logging
+
+ from cttPunctuator import CttPunctuator
+
+ logging.basicConfig(
+     level=logging.INFO,
+     format="[%(asctime)s %(levelname)s] [%(filename)s:%(lineno)d %(module)s.%(funcName)s] %(message)s",
+ )
+ # offline mode
+ punc = CttPunctuator()
+ text = "据报道纽约时报使用ChatGPT创建了一个情人节消息生成器用户只需输入几个提示就可以得到一封自动生成的情书"
+ logging.info(punc.punctuate(text)[0])
+
+ # online mode
+ punc = CttPunctuator(online=True)
+ text_in = (
+     "跨境河流是养育沿岸|人民的生命之源长期以来为帮助下游地区防灾减灾中方技术人员|"
+     "在上游地区极为恶劣的自然条件下克服巨大困难甚至冒着生命危险|"
+     "向印方提供汛期水文资料处理紧急事件中方重视印方在跨境河流>问题上的关切|"
+     "愿意进一步完善双方联合工作机制|凡是|中方能做的我们|"
+     "都会去做而且会做得更好我请印度朋友们放心中国在上游的|"
+     "任何开发利用都会经过科学|规划和论证兼顾上下游的利益"
+ )
+
+ vads = text_in.split("|")
+ rec_result_all = ""
+ for vad in vads:
+     result = punc.punctuate(vad)
+     rec_result_all += result[0]
+     logging.info(f"Part: {rec_result_all}")
+
+ logging.info(f"Final: {rec_result_all}")
version.txt ADDED
@@ -0,0 +1 @@
+ 0.0.1