File size: 5,027 Bytes
724f682
 
 
 
 
 
 
 
 
 
9d45d9e
724f682
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ae79188
724f682
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ae79188
 
724f682
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
import os
from langchain.prompts import PromptTemplate
from langchain.output_parsers.list import NumberedListOutputParser
from langchain.output_parsers import ResponseSchema, StructuredOutputParser
from langchain_community.chat_models import ChatOpenAI
import serpapi

# Name of the OpenAI chat model handed to ChatOpenAI by the helpers below.
model_name = "gpt-4-0125-preview"

# API credentials come from the environment; each is None when its variable is unset.
openai_key = os.environ.get("OPENAI_API_KEY")
serpapi_key = os.environ.get("SERP_API_KEY")
# generate search terms using OpenAI
def generate_search_terms(input_text: str, number_of_generated_search_terms):
    """Ask the chat model for Google Patents search queries for an invention idea.

    Args:
        input_text: Free-text description of the invention; substituted into
            the prompt as ``user_input``.
        number_of_generated_search_terms: How many queries to request; it is
            converted with ``str()`` and spliced into the prompt text.

    Returns:
        Whatever ``NumberedListOutputParser.parse`` produces from the model's
        reply (presumably a list of query strings -- confirm against the
        installed LangChain version).
    """
    list_parser = NumberedListOutputParser()

    # The requested count is baked into the template text; the invention
    # description and the parser's formatting rules are filled in as variables.
    template_text = (
        "As a search specialist with expertise in optimizing searches in the Google Patents database, your task is to generate "
        + str(number_of_generated_search_terms)
        + " optimal keyword or keyword list like single and multiple keywords(please choose correct terms, i want to get at least 10 results for each query, don't be too specific) like, `(rabbit toy), (coffee brew) AND (pot) OR (top), (stabilization system), (vr heading) OR (logic freq)`, so dont use \" or ' use only phranthesis,  searches to find similar patents for the following invention idea: ---BEGINNING--- `{user_input}` ---END--- {format_instructions}\n"
    )
    search_prompt = PromptTemplate(
        template=template_text,
        input_variables=["user_input"],
        partial_variables={
            "format_instructions": list_parser.get_format_instructions(),
        },
    )

    chat_model = ChatOpenAI(model_name=model_name, temperature=0.0)
    raw_reply = chat_model.predict(text=search_prompt.format(user_input=input_text))
    return list_parser.parse(raw_reply)

# search Google Patents using SerpApi
def search_on_google_patents(terms: list):
    """Run each search term against SerpApi's ``google_patents`` engine.

    Args:
        terms: Iterable of query strings. An empty iterable performs no
            request and yields an empty dict.

    Returns:
        dict mapping each search term to a list of patent dicts with the keys
        ``patentTitle``, ``patentNumber``, ``inventors``, ``assignee``,
        ``abstract``, ``publicationDate``, ``filingDate`` and ``patentUrl``.
        A field that SerpApi did not return for a result is ``None``
        (``inventors`` becomes an empty list).

    Raises:
        RuntimeError: If a response carries a truthy ``error`` entry.
    """
    search_terms_patents = {}
    for search_term in terms:
        params = {
            "engine": "google_patents",
            "q": search_term,
            "clustered": "true",
            "scholar": "true",
            "api_key": serpapi_key,
        }
        results = serpapi.search(params)

        error = results.get("error")
        if error:
            # The error entry is raised wrapped in an exception because
            # raising a non-exception object such as a plain string fails
            # with TypeError and hides the message.
            raise RuntimeError(
                f"SerpApi search failed for {search_term!r}: {error}"
            )

        patents = []
        # A response without "organic_results" is treated as zero hits
        # instead of aborting the whole batch with KeyError.
        for result in results.get("organic_results") or []:
            # Only entries carrying a patent_id are kept (presumably the
            # scholar=true entries lack it -- confirm against SerpApi docs).
            if "patent_id" not in result:
                continue
            inventor = result.get("inventor")
            patents.append(
                {
                    "patentTitle": result.get("title"),
                    "patentNumber": result.get("publication_number"),
                    "inventors": [inventor] if inventor is not None else [],
                    "assignee": result.get("assignee"),
                    "abstract": result.get("snippet"),
                    "publicationDate": result.get("publication_date"),
                    "filingDate": result.get("filing_date"),
                    "patentUrl": result.get("serpapi_link"),
                }
            )
        search_terms_patents[search_term] = patents
    return search_terms_patents

# check similarity of patents using OpenAI
def check_similarity_of_patents(input_text, patents: list):
    """Ask the chat model to score each patent's similarity to an invention.

    Args:
        input_text: Description of the invention; substituted into the prompt
            as ``user_input``.
        patents: Patent dicts, each providing ``patentTitle``,
            ``patentNumber`` and ``abstract``.

    Returns:
        The result of ``StructuredOutputParser.parse`` on the model's reply.
        The schema requests a ``listOfPatents`` key holding dicts of
        ``patentTitle``, ``patentNumber`` and ``similarityScore``; whether the
        model honours that shape is not enforced here.
    """
    llm = ChatOpenAI(model_name=model_name, temperature=0.0)

    response_schemas = [
        ResponseSchema(
            name="listOfPatents",
            description="List of dicts of patentTitle, patentNumber and similarityScore (score over 100): [{patentTitle: string, patentNumber: string, similarityScore: number}]",
            type="array(objects)",
        )
    ]
    output_parser = StructuredOutputParser.from_response_schemas(response_schemas)
    format_instructions = output_parser.get_format_instructions()

    patent_listing = '\n'.join(
        f"\n===BEGINNING=== {i+1} - {patent['patentTitle']} - {patent['patentNumber']} - {patent['abstract']} ===END==="
        for i, patent in enumerate(patents)
    )

    # The listing is injected as a partial variable rather than concatenated
    # into the template text: titles/abstracts are scraped data and any "{" or
    # "}" in them would otherwise be parsed as template placeholders. The
    # rendered prompt is unchanged for brace-free input.
    prompt = PromptTemplate(
        template="Could you please generate a semantic similarity score out of 100 for the following patent information {user_input}, comparing it with the following abstracts:{patent_listing}\n{format_instructions}\n",
        input_variables=["user_input"],
        partial_variables={
            "format_instructions": format_instructions,
            "patent_listing": patent_listing,
        },
    )
    output = llm.predict(text=prompt.format(user_input=input_text))
    return output_parser.parse(output)

# merge patents with similarity data
def merge_patents_with_similarity(patents, similarity_data):
    """Attach similarity scores to patents and return them best-first.

    Each dict in ``patents`` is updated in place (callers holding references
    see the new keys) and also collected into the returned list.

    Args:
        patents: Iterable of patent dicts, each with a ``patentNumber`` key.
        similarity_data: Mapping with a ``listOfPatents`` list whose entries
            carry ``patentNumber`` and ``similarityScore``. Entries missing
            either key are ignored; when a number occurs more than once the
            first entry wins.

    Returns:
        A new list of the patent dicts sorted by ``similarityScore``
        descending (ties keep their input order). Every patent ends up with
        ``similarityScore`` and ``patentGoogleUrl``; a patent with no entry in
        ``similarity_data`` keeps any score it already had, otherwise 0.

    Raises:
        KeyError: If a patent dict has no ``patentNumber``.
    """
    # One pass to index scores by patent number replaces the nested scan.
    # setdefault keeps the first occurrence, matching a first-match search.
    scores_by_number = {}
    for entry in (similarity_data or {}).get('listOfPatents') or []:
        if 'patentNumber' in entry and 'similarityScore' in entry:
            scores_by_number.setdefault(
                entry['patentNumber'], entry['similarityScore']
            )

    merged_list = []
    for patent in patents:
        patent_number = patent['patentNumber']
        google_url = f"https://patents.google.com/patent/{patent_number}"
        if patent_number in scores_by_number:
            patent['similarityScore'] = scores_by_number[patent_number]
            patent['patentGoogleUrl'] = google_url
        else:
            # The similarity data may omit patents; without a fallback score
            # the sort below would raise KeyError for the whole batch.
            patent.setdefault('similarityScore', 0)
            patent.setdefault('patentGoogleUrl', google_url)
        merged_list.append(patent)

    merged_list.sort(key=lambda item: item['similarityScore'], reverse=True)
    return merged_list

# sort patents by similarity score
def sort_patents_by_similarity_score(data):
    """Return a new list of the items in ``data``, highest ``similarityScore`` first.

    The input is not modified; items with equal scores keep their input order.
    Raises ``KeyError`` if an item has no ``similarityScore`` key.
    """
    ranked = list(data)
    ranked.sort(key=lambda patent: patent['similarityScore'], reverse=True)
    return ranked