Omartificial-Intelligence-Space committed on
Commit
802e961
·
verified ·
1 Parent(s): 1683c72

update app.py

Browse files
Files changed (1) hide show
  1. app.py +52 -40
app.py CHANGED
@@ -4,9 +4,10 @@ from wikipediaapi import Wikipedia
4
  import textwrap
5
  import numpy as np
6
  from openai import OpenAI
 
7
 
8
  # Function to process the input and generate the output
9
- def process_query(wiki_page, model_name, embed_dim, query, api_key):
10
  model_mapping = {
11
  "Arabic-mpnet-base-all-nli-triplet": "Omartificial-Intelligence-Space/Arabic-mpnet-base-all-nli-triplet",
12
  "Arabic-all-nli-triplet-Matryoshka": "Omartificial-Intelligence-Space/Arabic-all-nli-triplet-Matryoshka",
@@ -15,70 +16,81 @@ def process_query(wiki_page, model_name, embed_dim, query, api_key):
15
  "Marbert-all-nli-triplet-Matryoshka": "Omartificial-Intelligence-Space/Marbert-all-nli-triplet-Matryoshka"
16
  }
17
 
18
- model_path = model_mapping[model_name]
19
- model = SentenceTransformer(model_path, trust_remote_code=True, truncate_dim=embed_dim)
20
  wiki = Wikipedia('RAGBot/0.0', 'ar')
21
  doc = wiki.page(wiki_page).text
22
  paragraphs = doc.split('\n\n') # chunking
23
-
24
  for i, p in enumerate(paragraphs):
25
  wrapped_text = textwrap.fill(p, width=100)
26
 
27
- docs_embed = model.encode(paragraphs, normalize_embeddings=True)
28
- query_embed = model.encode(query, normalize_embeddings=True)
29
- similarities = np.dot(docs_embed, query_embed.T)
30
- top_3_idx = np.argsort(similarities, axis=0)[-3:][::-1].tolist()
31
- most_similar_documents = [paragraphs[idx] for idx in top_3_idx]
32
 
33
- CONTEXT = ""
34
- for i, p in enumerate(most_similar_documents):
35
- wrapped_text = textwrap.fill(p, width=100)
36
- CONTEXT += wrapped_text + "\n\n"
 
 
 
37
 
38
- prompt = f"""
39
- use the following CONTEXT to answer the QUESTION at the end.
40
- If you don't know the answer, just say that you don't know, don't try to make up an answer.
 
41
 
42
- CONTEXT: {CONTEXT}
43
- QUESTION: {query}
44
- """
 
 
 
45
 
46
- client = OpenAI(api_key=api_key)
47
- response = client.chat.completions.create(
48
- model="gpt-4o",
49
- messages=[
50
- {"role": "user", "content": prompt},
51
- ]
52
- )
 
 
53
 
54
- return response.choices[0].message.content
 
 
 
 
 
 
 
55
 
56
  # Define the interface
57
  wiki_page_input = gr.Textbox(label="Wikipedia Page (in Arabic)")
58
  query_input = gr.Textbox(label="Query (in Arabic)")
59
- api_key_input = gr.Textbox(label="OpenAI API Key", type="password")
60
-
61
- model_choice = gr.Dropdown(
62
- choices=[
63
- "Arabic-mpnet-base-all-nli-triplet",
64
- "Arabic-all-nli-triplet-Matryoshka",
65
- "Arabert-all-nli-triplet-Matryoshka",
66
- "Arabic-labse-Matryoshka",
67
- "Marbert-all-nli-triplet-Matryoshka"
68
- ],
69
- label="Choose Embedding Model"
70
- )
71
 
72
  embed_dim_choice = gr.Dropdown(
73
  choices=[768, 512, 256, 128, 64],
74
  label="Embedding Dimension"
75
  )
76
 
 
 
 
 
 
77
  output_text = gr.Textbox(label="Output")
78
 
 
 
 
 
 
 
 
 
79
  gr.Interface(
80
  fn=process_query,
81
- inputs=[wiki_page_input, model_choice, embed_dim_choice, query_input, api_key_input],
82
  outputs=output_text,
83
  title="Arabic Wiki RAG",
84
  description="Choose a Wikipedia page, embedding model, and dimension to answer a query in Arabic."
 
4
  import textwrap
5
  import numpy as np
6
  from openai import OpenAI
7
+ from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
8
 
9
  # Function to process the input and generate the output
10
+ def process_query(wiki_page, embed_dim, query, api_key=None, mode="OpenAI"):
11
  model_mapping = {
12
  "Arabic-mpnet-base-all-nli-triplet": "Omartificial-Intelligence-Space/Arabic-mpnet-base-all-nli-triplet",
13
  "Arabic-all-nli-triplet-Matryoshka": "Omartificial-Intelligence-Space/Arabic-all-nli-triplet-Matryoshka",
 
16
  "Marbert-all-nli-triplet-Matryoshka": "Omartificial-Intelligence-Space/Marbert-all-nli-triplet-Matryoshka"
17
  }
18
 
 
 
19
  wiki = Wikipedia('RAGBot/0.0', 'ar')
20
  doc = wiki.page(wiki_page).text
21
  paragraphs = doc.split('\n\n') # chunking
 
22
  for i, p in enumerate(paragraphs):
23
  wrapped_text = textwrap.fill(p, width=100)
24
 
25
+ responses = {}
 
 
 
 
26
 
27
+ for model_name, model_path in model_mapping.items():
28
+ model = SentenceTransformer(model_path, trust_remote_code=True, truncate_dim=embed_dim)
29
+ docs_embed = model.encode(paragraphs, normalize_embeddings=True)
30
+ query_embed = model.encode(query, normalize_embeddings=True)
31
+ similarities = np.dot(docs_embed, query_embed.T)
32
+ top_3_idx = np.argsort(similarities, axis=0)[-3:][::-1].tolist()
33
+ most_similar_documents = [paragraphs[idx] for idx in top_3_idx]
34
 
35
+ CONTEXT = ""
36
+ for p in most_similar_documents:
37
+ wrapped_text = textwrap.fill(p, width=100)
38
+ CONTEXT += wrapped_text + "\n\n"
39
 
40
+ prompt = f"""
41
+ use the following CONTEXT to answer the QUESTION at the end.
42
+ If you don't know the answer, just say that you don't know, don't try to make up an answer.
43
+ CONTEXT: {CONTEXT}
44
+ QUESTION: {query}
45
+ """
46
 
47
+ if mode == "OpenAI":
48
+ client = OpenAI(api_key=api_key)
49
+ response = client.chat.completions.create(
50
+ model="gpt-4",
51
+ messages=[
52
+ {"role": "user", "content": prompt},
53
+ ]
54
+ )
55
+ responses[model_name] = response.choices[0].message.content
56
 
57
+ elif mode == "OpenSource":
58
+ tokenizer = AutoTokenizer.from_pretrained("google/gemini-2b")
59
+ model = AutoModelForCausalLM.from_pretrained("google/gemini-2b")
60
+ generator = pipeline('text-generation', model=model, tokenizer=tokenizer)
61
+ response = generator(prompt, max_length=512, num_return_sequences=1)
62
+ responses[model_name] = response[0]['generated_text']
63
+
64
+ return "\n\n".join([f"Model: {model_name}\nResponse: {response}" for model_name, response in responses.items()])
65
 
66
  # Define the interface
67
  wiki_page_input = gr.Textbox(label="Wikipedia Page (in Arabic)")
68
  query_input = gr.Textbox(label="Query (in Arabic)")
69
+ api_key_input = gr.Textbox(label="OpenAI API Key", type="password", visible=False)
 
 
 
 
 
 
 
 
 
 
 
70
 
71
  embed_dim_choice = gr.Dropdown(
72
  choices=[768, 512, 256, 128, 64],
73
  label="Embedding Dimension"
74
  )
75
 
76
+ mode_choice = gr.Radio(
77
+ choices=["OpenAI", "OpenSource"],
78
+ label="Choose Mode"
79
+ )
80
+
81
  output_text = gr.Textbox(label="Output")
82
 
83
def on_mode_change(mode):
    """Show the OpenAI API key field only when the "OpenAI" mode is selected.

    Gradio applies whatever an event handler returns to the component(s)
    listed as its ``outputs``. Assigning ``api_key_input.visible`` in place
    only changes the Python object and never reaches the rendered page, so
    the visibility change is returned as an update instead.

    Args:
        mode: The value currently selected in the mode radio
            ("OpenAI" or "OpenSource").

    Returns:
        A Gradio update that sets the API key textbox visible for "OpenAI"
        and hidden for any other mode.
    """
    return gr.update(visible=(mode == "OpenAI"))
88
+
89
+ mode_choice.change(on_mode_change, inputs=mode_choice, outputs=api_key_input)
90
+
91
  gr.Interface(
92
  fn=process_query,
93
+ inputs=[wiki_page_input, embed_dim_choice, query_input, api_key_input, mode_choice],
94
  outputs=output_text,
95
  title="Arabic Wiki RAG",
96
  description="Choose a Wikipedia page, embedding model, and dimension to answer a query in Arabic."