Corrected table tokens and moved to previous chipper
app.py +1 -1
logits_ngrams.py +1 -1
app.py
CHANGED
@@ -81,7 +81,7 @@ else:
     st.image(image, caption='Your target document')

     with st.spinner(f'Processing the document ...'):
-        pre_trained_model = "unstructuredio/chipper-fast-fine-tuning
+        pre_trained_model = "unstructuredio/chipper-fast-fine-tuning"
         processor = DonutProcessor.from_pretrained(pre_trained_model, token=os.environ['HF_TOKEN'])

         device = "cuda" if torch.cuda.is_available() else "cpu"
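For context, the lines around this hunk follow the usual Donut-style loading pattern. Below is a minimal sketch of that step, assuming Streamlit, torch, and transformers are installed and HF_TOKEN is set; the VisionEncoderDecoderModel class and the model.to(device) call are assumptions (Chipper is a Donut-style model), everything else is taken from the diff.

# Minimal sketch of the loading step shown above (assumptions noted inline).
import os

import streamlit as st
import torch
from transformers import DonutProcessor, VisionEncoderDecoderModel

with st.spinner('Processing the document ...'):
    pre_trained_model = "unstructuredio/chipper-fast-fine-tuning"

    # Gated repo: pass the Hugging Face access token so from_pretrained can authenticate.
    processor = DonutProcessor.from_pretrained(pre_trained_model, token=os.environ['HF_TOKEN'])

    # Assumption: Chipper is Donut-based, so VisionEncoderDecoderModel is used here.
    model = VisionEncoderDecoderModel.from_pretrained(pre_trained_model, token=os.environ['HF_TOKEN'])

    # Run on GPU when available, otherwise fall back to CPU.
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model.to(device)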
logits_ngrams.py
CHANGED
@@ -59,5 +59,5 @@ def _calc_banned_tokens(prev_input_ids, num_hypos, no_repeat_ngram_size, cur_len
 
 
 def get_table_token_ids(processor):
-    skip_tokens = {token_id for token, token_id in processor.tokenizer.get_added_vocab().items() if
+    skip_tokens = {token_id for token, token_id in processor.tokenizer.get_added_vocab().items() if token.startswith("<t") or token.startswith("</t") }
 
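Put together, the corrected helper reads as below. This is a sketch, not the full logits_ngrams.py: the return statement and the usage comment are assumptions, while the set comprehension is the line added in the diff.

def get_table_token_ids(processor):
    # Collect ids of the added table-markup tokens (<table>, <tr>, <td>, </td>, ...),
    # all of which start with "<t" or "</t" in the tokenizer's added vocabulary.
    skip_tokens = {
        token_id
        for token, token_id in processor.tokenizer.get_added_vocab().items()
        if token.startswith("<t") or token.startswith("</t")
    }
    return skip_tokens  # assumption: the function returns this set


# Hypothetical usage: keep table structure tokens out of the no-repeat-ngram check.
# table_token_ids = get_table_token_ids(processor)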