"""Text clean-up, grouping and summarisation helpers.

Summaries are produced by the Baidu AIP ``news_summary`` HTTP endpoint.
"""

import json
import os
import re

import backoff
import jieba  # not referenced in this section; kept in case other code relies on it
import requests

# NOTE(review): these credentials are committed in source. They are kept only
# as a fallback so existing deployments keep working; rotate them and supply
# BAIDU_API_KEY / BAIDU_SECRET_KEY through the environment instead.
_DEFAULT_API_KEY = "IZt1uK9PAI0LiqleqT0cE30b"
_DEFAULT_SECRET_KEY = "Xv5kHB8eyhNuI1B1G7fRgm2SIPdlxGxs"

_TOKEN_URL = "https://aip.baidubce.com/oauth/2.0/token"
_SUMMARY_URL = (
    "https://aip.baidubce.com/rpc/2.0/nlp/v1/news_summary"
    "?charset=UTF-8&access_token="
)

# Seconds to wait on a single HTTP call before giving up on it.
_REQUEST_TIMEOUT = 30

# URLs are recognised by an explicit scheme or a leading "www."; the tail is
# restricted to ASCII URL characters so adjacent CJK text is never swallowed.
_URL_RE = re.compile(
    r"(?:(?:https?|ftp)://|www\.)[A-Za-z0-9\-._~:/?#\[\]@!$&'()*+,;=%]+",
    re.IGNORECASE,
)
_DATE_RE = re.compile(r"\d+/\d+/\d+")
_TIME_RE = re.compile(r"[0-2]?[0-9]:[0-6][0-9]")


@backoff.on_exception(backoff.expo, requests.exceptions.RequestException)
def post_url(url, headers, payload, timeout=_REQUEST_TIMEOUT):
    """POST ``payload`` to ``url`` and return the ``requests`` response.

    The call is retried with exponential backoff whenever ``requests`` raises
    a ``RequestException`` (which includes timeouts). The HTTP status code is
    not inspected, so error responses are returned to the caller as-is.

    Args:
        url: Target URL.
        headers: Mapping of HTTP headers to send.
        payload: Request body, passed to ``requests`` as ``data``.
        timeout: Seconds to wait for the server before the attempt is
            abandoned (and retried by the decorator).

    Returns:
        The ``requests.Response`` of the first attempt that did not raise.
    """
    # NOTE(review): no max_tries/max_time is configured, so a permanent
    # failure (e.g. a malformed URL) is retried indefinitely - confirm that
    # this is intended.
    return requests.request(
        "POST", url, headers=headers, data=payload, timeout=timeout
    )


def seg(text):
    """Split ``text`` into sentences after each ``。``, ``!`` or ``?``.

    The terminator stays attached to its sentence and any whitespace that
    follows it is consumed. When ``text`` ends with a terminator the returned
    list ends with an empty string; an empty ``text`` yields ``['']``.

    Returns:
        A list of sentence strings.
    """
    return re.split(r'(?<=[。!?])\s*', text)


def clean_text(text):
    """Strip URLs, dates, times and every non-letter character from ``text``.

    Steps, in order:
      1. newlines become spaces;
      2. URLs (``http://``, ``https://``, ``ftp://`` or ``www.`` prefixed)
         are removed - this runs first so the later rewrites cannot break a
         URL into pieces that survive as junk words;
      3. hyphens become spaces;
      4. ``d/d/d`` dates and ``hh:mm`` times are removed;
      5. only characters for which ``str.isalpha()`` is true (this includes
         CJK characters) and spaces are kept - other characters are dropped
         without inserting a space;
      6. whitespace-separated words of length 1 are discarded and the rest
         are joined with single spaces.

    Returns:
        The cleaned string (possibly empty).
    """
    text = text.replace('\n', " ")
    text = _URL_RE.sub("", text)  # URL
    text = text.replace("-", " ")
    text = _DATE_RE.sub("", text)  # date
    text = _TIME_RE.sub("", text)  # time
    letters_only = ''.join(
        ch for ch in text if ch.isalpha() or ch == ' '
    )
    return ' '.join(word for word in letters_only.split() if len(word) > 1)


def article_to_group(groups, topics):
    """Merge the items of ``groups`` by key.

    Args:
        groups: Iterable of indexable items where ``item[1]`` is the group
            key and ``item[0]`` is the value to accumulate (presumably text
            fragments - confirm against the caller).
        topics: Unused; kept so existing callers do not break.

    Returns:
        A dict mapping each key to its values combined with ``+`` in
        iteration order.
    """
    para = {}
    for item in groups:
        value, key = item[0], item[1]
        if key in para:
            # Plain "+" (not "+=") so a mutable first value that came from
            # the caller is never modified in place.
            para[key] = para[key] + value
        else:
            para[key] = value
    return para


def _get_access_token():
    """Fetch an OAuth access token for the Baidu AIP endpoints.

    Credentials are read from the ``BAIDU_API_KEY`` and ``BAIDU_SECRET_KEY``
    environment variables, falling back to the values bundled in this module.

    Raises:
        KeyError: The token response carries no ``access_token`` field.
    """
    params = {
        "grant_type": "client_credentials",
        "client_id": os.environ.get("BAIDU_API_KEY") or _DEFAULT_API_KEY,
        "client_secret": (
            os.environ.get("BAIDU_SECRET_KEY") or _DEFAULT_SECRET_KEY
        ),
    }
    reply = requests.post(
        _TOKEN_URL, params=params, timeout=_REQUEST_TIMEOUT
    ).json()
    token = reply.get("access_token")
    if token is None:
        # Fail here instead of sending the literal string "None" as a token.
        raise KeyError(f"'access_token' missing from token response: {reply!r}")
    return str(token)


def generation(para, max_length):
    """Summarise every value of ``para`` with the ``news_summary`` endpoint.

    One access token is requested up front and reused for all entries; each
    entry is sent through ``post_url`` and therefore retried on request
    errors.

    Args:
        para: Mapping of key -> text to summarise.
        max_length: Value sent as ``max_summary_len`` with every request.

    Returns:
        A dict mapping each key of ``para`` to ``(summary, original_text)``.

    Raises:
        KeyError: The token response has no ``access_token`` or a summary
            response has no ``summary`` field; the offending response is
            included in the message.
    """
    url = _SUMMARY_URL + _get_access_token()
    headers = {
        'Content-Type': 'application/json',
        'Accept': 'application/json'
    }

    topic = {}
    for key, content in para.items():
        payload = json.dumps({
            "content": content,
            "max_summary_len": max_length
        })
        response = post_url(url, headers, payload)
        text_dict = json.loads(response.text)
        try:
            summary = text_dict['summary']
        except KeyError:
            raise KeyError(
                f"'summary' missing from news_summary response: {text_dict!r}"
            ) from None
        topic[key] = (summary, content)
    return topic