Upload folder using huggingface_hub

Files changed:
- README.md +10 -24
- configuration_pointsv15_chat.py +0 -6
- preprocessor_config.json +4 -2
README.md
CHANGED
@@ -1,17 +1,3 @@
----
-datasets:
-- HuggingFaceM4/Docmatix
-- opendatalab/OmniDocBench
-language:
-- zh
-- en
-base_model:
-- Qwen/Qwen2.5-3B-Instruct
-- WePOINTS/POINTS-Qwen-2-5-7B-Chat
-tags:
-- vision-language
-- document-parsing
----
 <p align="center">
   <img src="images/logo.png" width="700"/>
 <p>
@@ -51,7 +37,7 @@ We are delighted to announce that the WePOINTS family has welcomed a new member:
 
 ## Results
 
-For comparison, we use the results reported by [OmniDocBench](https://github.com/opendatalab/OmniDocBench/tree/main):
+We take the following results from [OmniDocBench](https://github.com/opendatalab/OmniDocBench/tree/main) and POINTS-Reader for comparison:
 
 <table style="width: 92%; margin: auto; border-collapse: collapse;">
   <thead>
@@ -239,8 +225,8 @@ For comparison, we use the results reported by [OmniDocBench](https://github.com
       <td>0.641</td>
     </tr>
     <tr>
-      <td rowspan="
-      <td
+      <td rowspan="10">Expert VLMs</td>
+      <td>POINTS-Reader-3B</td>
       <td>0.133</td>
       <td>0.212</td>
       <td>0.062</td>
@@ -607,9 +593,9 @@ prompt = (
 image_path = '/path/to/your/local/image'
 model_path = 'tencent/POINTS-Reader'
 model = AutoModelForCausalLM.from_pretrained(model_path,
-
-
-
+                                             trust_remote_code=True,
+                                             torch_dtype=torch.float16,
+                                             device_map='cuda')
 tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
 image_processor = Qwen2ImageProcessorForPOINTSV15.from_pretrained(model_path)
 content = [
@@ -647,8 +633,8 @@ We will create a Pull Request to SGLang, please stay tuned.
 
 ## Known Issues
 
-- **Complex Document Parsing**: POINTS-Reader can struggle with complex layouts (e.g., newspapers), often producing repeated or missing content.
-- **Handwritten Document Parsing**: It also has difficulty handling handwritten inputs (e.g., receipts, notes), which can lead to recognition errors or omissions.
+- **Complex Document Parsing**: POINTS-Reader can struggle with complex layouts (e.g., newspapers), often producing repeated or missing content.
+- **Handwritten Document Parsing**: It also has difficulty handling handwritten inputs (e.g., receipts, notes), which can lead to recognition errors or omissions.
 - **Multi-language Document Parsing**: POINTS-Reader currently supports only English and Chinese, limiting its effectiveness on other languages.
 
 ## Citation
@@ -659,7 +645,7 @@ If you use this model in your work, please cite the following paper:
 @article{points-reader,
   title={POINTS-Reader: Distillation-Free Adaptation of Vision-Language Models for Document Conversion},
   author={Liu, Yuan and Zhongyin Zhao and Tian, Le and Haicheng Wang and Xubing Ye and Yangxiu You and Zilin Yu and Chuhan Wu and Zhou, Xiao and Yu, Yang and Zhou, Jie},
-  journal={
+  journal={},
   year={2025}
 }
 
@@ -683,4 +669,4 @@ If you use this model in your work, please cite the following paper:
   journal={arXiv preprint arXiv:2405.11850},
   year={2024}
 }
-```
+```
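The new keyword arguments pin down how the README's quick-start loads the checkpoint: remote code enabled, fp16 weights, single-GPU placement. For readers of this commit, a complete round trip might look like the sketch below. It follows the POINTS-family chat convention; the prompt text, the generation-config keys, and the use of `AutoImageProcessor` as a stand-in for `Qwen2ImageProcessorForPOINTSV15` (whose import path is not shown in this diff) are assumptions, not part of the change.

```python
import torch
from transformers import AutoImageProcessor, AutoModelForCausalLM, AutoTokenizer

model_path = 'tencent/POINTS-Reader'

# Loading flags as added in this commit: remote code, fp16, single CUDA device.
model = AutoModelForCausalLM.from_pretrained(model_path,
                                             trust_remote_code=True,
                                             torch_dtype=torch.float16,
                                             device_map='cuda')
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
# Stand-in loader; the README imports Qwen2ImageProcessorForPOINTSV15 directly
# from the checkpoint's remote code.
image_processor = AutoImageProcessor.from_pretrained(model_path,
                                                     trust_remote_code=True)

# Hypothetical prompt and content list; the README's own values are truncated
# in this diff.
image_path = '/path/to/your/local/image'
content = [
    dict(type='image', image=image_path),
    dict(type='text', text='Convert the document in the image to Markdown.'),
]
messages = [{'role': 'user', 'content': content}]
generation_config = {'max_new_tokens': 2048, 'do_sample': False}

# POINTS-family checkpoints expose a chat() helper through their remote code.
response = model.chat(messages, tokenizer, image_processor, generation_config)
print(response)
```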
configuration_pointsv15_chat.py
CHANGED
@@ -27,9 +27,3 @@ class POINTSV15ChatConfig(PretrainedConfig):
             self.llm_config = Qwen2Config(**llm_config)
         else:
             self.llm_config = llm_config
-
-    def to_dict(self) -> Dict[str, Any]:
-        output = copy.deepcopy(self.__dict__)
-        output["vision_config"] = self.vision_config.to_dict()
-        output["llm_config"] = self.llm_config.to_dict()
-        return output
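The deleted `to_dict` override duplicated behavior that recent `transformers` releases already provide: `PretrainedConfig.to_dict` converts nested config objects (the CLIP-style composite-config path) to plain dicts on its own, so the hand-rolled copy was dead weight and one more place to drift out of sync. If `copy`, `Dict`, and `Any` were imported only for this method, they can be dropped as well. A quick sanity check, assuming a transformers version with that nested-config handling, might look like:

```python
# Minimal sketch: verify nested configs still serialize without the override.
# Assumes a recent transformers where PretrainedConfig.to_dict converts
# sub-configs to plain dicts by itself.
from transformers import AutoConfig

config = AutoConfig.from_pretrained('tencent/POINTS-Reader', trust_remote_code=True)
d = config.to_dict()

# These are exactly the two fields the removed method used to convert by hand.
assert isinstance(d['vision_config'], dict)
assert isinstance(d['llm_config'], dict)
```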
preprocessor_config.json
CHANGED
@@ -22,8 +22,10 @@
   "rescale_factor": 0.00392156862745098,
   "size": {
     "max_pixels": 12845056,
-    "min_pixels": 3136
+    "min_pixels": 3136,
+    "longest_edge": 12845056,
+    "shortest_edge": 3136
   },
   "temporal_patch_size": 2,
   "processor_class": "Qwen2VLProcessor"
-}
+}
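Keeping both key pairs in `size` looks like a compatibility shim: older Qwen2-VL image processors read `min_pixels`/`max_pixels`, while newer `transformers` releases expect `shortest_edge`/`longest_edge`, and both pairs here encode the same pixel budget (3136 = 56 x 56 at the low end, 12845056 at the high end). A rough illustration of the resize rule that budget implies, modeled on Qwen2-VL's smart-resize behavior (the helper below is an illustrative re-implementation, not the library's code):

```python
import math

def smart_resize_sketch(height: int, width: int,
                        factor: int = 28,
                        min_pixels: int = 3136,
                        max_pixels: int = 12845056) -> tuple[int, int]:
    """Snap each side to the 28px patch grid, then rescale so the total
    pixel count stays within [min_pixels, max_pixels]."""
    h = max(factor, round(height / factor) * factor)
    w = max(factor, round(width / factor) * factor)
    if h * w > max_pixels:
        scale = math.sqrt(height * width / max_pixels)
        h = math.floor(height / scale / factor) * factor
        w = math.floor(width / scale / factor) * factor
    elif h * w < min_pixels:
        scale = math.sqrt(min_pixels / (height * width))
        h = math.ceil(height * scale / factor) * factor
        w = math.ceil(width * scale / factor) * factor
    return h, w

# A 6000x4000 scan exceeds the 12845056-pixel cap, so it is scaled down to
# 4368x2912 (about 12.7M pixels) before patching.
print(smart_resize_sketch(6000, 4000))
```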