Yiming Qian committed on
Commit
8cb44d1
·
verified ·
1 Parent(s): e0d1d85

Update README.md

Browse files
Files changed (1) hide show
  1. README.md +0 -120
README.md CHANGED
@@ -9,123 +9,3 @@ It is a model based on quantized LLAMA 3 8B. The goal of this model is designed
9
 
10
  Please use the following code to parse a PDF.
11
 
12
- Our code requires importing the following libraries:
13
- '''
14
- import pymupdf
15
- from bs4 import BeautifulSoup
16
- import pickle
17
- import torch
18
- from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
19
- '''
20
-
21
- model_kwargs = dict(
22
- use_cache=False,
23
- trust_remote_code=True,
24
- attn_implementation="flash_attention_2",
25
- torch_dtype=torch.bfloat16,
26
- device_map="cuda",
27
- load_in_4bit=True
28
- )
29
-
30
- model = AutoModelForCausalLM.from_pretrained("./model_4bit", **model_kwargs)
31
-
32
- tokenizer = AutoTokenizer.from_pretrained("./model_4bit")
33
- tokenizer.model_max_length = 8000
34
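- tokenizer.pad_token = tokenizer.eos_token # pads with the eos token; NOTE(review): the earlier comment said to use unk rather than eos to prevent endless generation, which contradicts this assignment - confirm which is intended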
35
- tokenizer.pad_token_id = tokenizer.convert_tokens_to_ids(tokenizer.pad_token)
36
- tokenizer.padding_side = 'right'
37
-
38
-
39
- SYSTEM = '''The user's input is data in XML format. Please organize it into a markdown format. Pay attention to:
40
-
41
- 1. Directly output the results. Do not make summary of the text.
42
- 2. Do not alter any text from the XML. Do not change number into words.
43
- 3. Correct format errors, such as misalignment between numbers and text, and disorder in the sequence of table cells.
44
- 4. Use markdown, but all numbers must be explicitly written out in full (e.g., 3.2.5.1).
45
- 5. Preserve the original document structure as much as possible, such as paragraphs, lists, etc.
46
- 6. Pay attention to detecting tables in the text (as the table format may have been lost due to copying from the XML). Restore the table's format and maintain its integrity. Some tables may be too long and span across pages. Pay attention to merging the same tables that span pages. Properly handle table headers to avoid repetition or omission.
47
- 7. Text from the XML may contain some garbled characters; remove any characters that are garbled.
48
- 8. Convert headings (H1, H2, H3, etc.) into their respective Markdown heading levels (e.g., 3 for # 3, 3.2 for ## 3.2, 3.2.1 for ### 3.2.1).
49
- 9. Include metadata information in the output, such as document title, section number, etc.
50
- 10. Remove the footnote and page number, it is important!!!
51
- 11. Make sure phrase connected with - will not break up.
52
- '''
53
-
54
- def merge_elements_up_to_max_length(elements, max_length):
55
- """
56
- Greedily merge adjacent elements while the combined length stays within max_length (a single element longer than max_length is kept unchanged).
57
-
58
- Parameters:
59
- - elements: List[str] - The list of string elements to merge.
60
- - max_length: int - The maximum allowed length for any element after merging.
61
-
62
- Returns:
63
- - List[str]: A new list where the elements have been merged as necessary.
64
- """
65
- if not elements:
66
- return []
67
-
68
- # Initialize the list with the first element
69
- merged = [elements[0]]
70
-
71
- for element in elements[1:]:
72
- # Check if the last element in merged list can be combined with the current element
73
- if len(merged[-1]) + len(element) <= max_length:
74
- merged[-1] += element # Merge with the last element
75
- else:
76
- merged.append(element) # Add as a new element
77
-
78
- return merged
79
-
80
-
81
- pipe = pipeline(
82
- "text-generation",
83
- model=model,
84
- tokenizer=tokenizer,
85
- )
86
-
87
- generation_args = {
88
- "max_new_tokens": 2000,
89
- "return_full_text": False,
90
- "do_sample": False,
91
- }
92
-
93
-
94
- filename ='2023071000529.pdf'
95
-
96
- elements=[]
97
-
98
- with pymupdf.open(filename) as doc:
99
-
100
- for page in doc:
101
- soup = BeautifulSoup(page.get_text('xhtml'), 'html.parser')
102
- for img in soup("img"):
103
- img.decompose()
104
-
105
- page_element=''
106
- for item in soup.find_all('p'):
107
- if len(item.get_text())<2:
108
- item.decompose()
109
- else:
110
- #elements.append(str(item))
111
- page_element=page_element+str(item)
112
- elements.append(page_element)
113
- elements.append("<hr>")
114
-
115
- max_length=7000
116
-
117
- merged_elements=merge_elements_up_to_max_length(elements, max_length)
118
-
119
- markdown_text=''
120
- for j in range(len(merged_elements)):
121
- item =merged_elements[j]
122
- messages=[{"role": "system", "content": SYSTEM},
123
- {"role": "user", "content": item}]
124
- output = pipe(messages, **generation_args)
125
- markdown_text=markdown_text+output[0]['generated_text']+'\n'
126
-
127
- main_file = filename[:-4]+'.md'
128
- with open(main_file, "w") as f:
129
- f.write(markdown_text)
130
-
131
-
 
9
 
10
  Please use the following code to parse a PDF.
11