Spaces:

kltn20133118
/

demo_obsei

Sleeping

App Files Files Community

demo_obsei / pipe_line_obsei.py

kltn20133118

Upload 4 files

c85c606 verified about 2 months ago

raw

history blame contribute delete

6.86 kB

	import logging
	import json
	from transformers import pipeline
	from obsei_module.obsei.payload import TextPayload
	from obsei_module.obsei.analyzer.classification_analyzer import ZeroShotClassificationAnalyzer, ClassificationAnalyzerConfig
	from obsei_module.obsei.source.website_crawler_source import TrafilaturaCrawlerConfig, TrafilaturaCrawlerSource
	from obsei_module.obsei.sink.http_sink import HttpSinkConfig, HttpSink
	from obsei_module.obsei.analyzer.sentiment_analyzer import *
	import connect_mongo
	import pymongo


	async def get_object_by_link(db_name, collection_name, link):
	    """Look up a single stored document by its ``link`` field.

	    Args:
	        db_name: Database name handed to ``connect_mongo.connect_to_mongo``.
	        collection_name: Collection name handed to the same helper.
	        link: Value the document's ``link`` field must equal.

	    Returns:
	        Whatever ``find_one`` yields for the query, or ``None`` (after
	        printing a message) when the connection helper returned ``None``.
	    """
	    coll = connect_mongo.connect_to_mongo(db_name, collection_name)

	    # Compare against None explicitly; presumably this is a pymongo
	    # Collection, which does not support truth-value testing.
	    if coll is not None:
	        return coll.find_one({"link": link})

	    print("Failed to connect to MongoDB.")
	    return None


	# Connect to another DB and check whether a matching record already exists there.
	async def get_existing_record_by_name(db_name, collection_name, name, link):
	    """Fetch a document whose ``name`` and ``link`` fields both match.

	    Args:
	        db_name: Database name handed to ``connect_mongo.connect_to_mongo``.
	        collection_name: Collection name handed to the same helper.
	        name: Required value of the document's ``name`` field.
	        link: Required value of the document's ``link`` field.

	    Returns:
	        The result of ``find_one`` for that pair, or ``None`` (after
	        printing a message) when the connection helper returned ``None``.
	    """
	    # Both fields must match for a record to count as already existing.
	    query = {"name": name, "link": link}

	    handle = connect_mongo.connect_to_mongo(db_name, collection_name)
	    if handle is None:
	        print("Failed to connect to MongoDB.")
	        return None

	    return handle.find_one(query)

	# Save processed_text to MongoDB, first checking for an existing record with the same name.
	async def save_processed_text_to_mongo(db_name, collection_name, link, processed_text, name=None):
	    """Insert or update the analysis record for ``link`` in MongoDB.

	    A record is looked up by ``(name, link)`` via
	    ``get_existing_record_by_name``. If one is found its fields are
	    overwritten with ``$set``; otherwise a new document is inserted.

	    Args:
	        db_name: Database name handed to ``connect_mongo.connect_to_mongo``.
	        collection_name: Collection name handed to the same helper.
	        link: Link the record belongs to; stored in the ``link`` field.
	        processed_text: Analysis output stored in ``processed_text``.
	        name: Label stored in the ``name`` field and used in the lookup.

	    Returns:
	        The result object of ``update_one`` or ``insert_one``, or ``None``
	        (after printing a message) when the connection helper returned
	        ``None``.
	    """
	    target = connect_mongo.connect_to_mongo(db_name, collection_name)
	    if target is None:
	        print("Failed to connect to MongoDB.")
	        return None

	    # The same payload serves as the new document and as the $set body.
	    record = {
	        "link": link,
	        "processed_text": processed_text,
	        "name": name,
	    }

	    previous = await get_existing_record_by_name(db_name, collection_name, name, link)

	    # No earlier record for this name/link pair: create one.
	    if not previous:
	        outcome = target.insert_one(record)
	        print(f"Successfully inserted a new record for {link}.")
	        return outcome

	    # An earlier record exists: overwrite its fields in place.
	    outcome = target.update_one({"_id": previous["_id"]}, {"$set": record})
	    message = (
	        f"Successfully updated the record for {link}."
	        if outcome.modified_count > 0
	        else f"No changes made to the record for {link}."
	    )
	    print(message)
	    return outcome

	# Process a URL: fetch its data, analyze it, and save processed_text to MongoDB.
	async def process_url(url: str, db_name: str, collection_name: str, backup_db_name: str, backup_collection_name: str):
	    """Crawl a URL, classify its content zero-shot, and store the result.

	    Steps:
	      1. Crawl ``url`` with the Trafilatura crawler source.
	      2. Classify the crawled payloads with a zero-shot classifier.
	      3. Post the analysis to the HTTP sink and parse the JSON it returns.
	      4. Save the extracted ``segmented_data`` to the backup collection
	         under the name "Sentiment Analysis".

	    Args:
	        url: Page to crawl and analyze.
	        db_name: Database queried for an already stored article with this link.
	        collection_name: Collection queried for that article.
	        backup_db_name: Database the analysis result is written to.
	        backup_collection_name: Collection the analysis result is written to.

	    Returns:
	        On success, a ``(processed_text, text)`` tuple. ``processed_text`` is
	        the ``json.segmented_data`` field of the first sink response, or the
	        string "No processed text available." when that field is absent.
	        ``text`` is the ``json.meta.text`` field of the same response, or
	        ``None`` when it is absent. On failure, a dict of the form
	        ``{"error": <message>}``.
	    """
	    logging.basicConfig(level=logging.INFO)
	    logger = logging.getLogger(__name__)

	    # Step 1: crawl the data from the URL.
	    crawler_config = TrafilaturaCrawlerConfig(
	        urls=[url]
	    )
	    crawler = TrafilaturaCrawlerSource()

	    crawled_data = crawler.lookup(config=crawler_config)
	    if not crawled_data:
	        logger.error("No data found from crawler")
	        return {"error": "No data found from crawler"}

	    # Step 2: configure and run the zero-shot classification.
	    analyzer_config = ClassificationAnalyzerConfig(
	        labels=["Sports", "Politics", "Technology", "Entertainment"],
	        multi_class_classification=False,
	        add_positive_negative_labels=False
	    )

	    analyzer = ZeroShotClassificationAnalyzer(
	        model_name_or_path="facebook/bart-large-mnli",
	        device="auto"
	    )

	    analysis_results = analyzer.analyze_input(
	        source_response_list=crawled_data,
	        analyzer_config=analyzer_config
	    )

	    # Step 3: send the analysis through the HTTP sink. The code below reads
	    # the posted payload back from the "json" key of each response body
	    # (httpbin.org/post is expected to echo it there).
	    http_sink_config = HttpSinkConfig(
	        url="https://httpbin.org/post",
	        headers={"Content-type": "application/json"},
	        base_payload={"common_field": "test cua VNY"},
	    )

	    http_sink = HttpSink()

	    responses = http_sink.send_data(analysis_results, http_sink_config)

	    response_bodies = [
	        json.loads(response.read().decode("utf-8"))
	        for response in responses
	    ]
	    if not response_bodies:
	        # Without any response there is nothing to extract or store.
	        logger.error("No response received from HTTP sink")
	        return {"error": "No response received from HTTP sink"}

	    # Only the first response is used for the stored result.
	    echoed_payload = response_bodies[0].get("json") or {}
	    processed_text = echoed_payload.get('segmented_data', 'No processed text available.')
	    # "meta" may be missing or null; fall back to an empty mapping so the
	    # lookup yields None instead of raising AttributeError.
	    article_text = (echoed_payload.get('meta') or {}).get('text')

	    # Step 4: the source article lookup is kept for visibility only; the
	    # analysis is saved to the backup collection whether or not it exists.
	    existing_record = await get_object_by_link(db_name, collection_name, url)
	    logger.info(
	        "Source article %s for %s",
	        "found" if existing_record else "not found",
	        url,
	    )
	    await save_processed_text_to_mongo(backup_db_name, backup_collection_name, url, processed_text, name="Sentiment Analysis")

	    return processed_text, article_text

	# url = "https://laodong.vn/bong-da-quoc-te/vua-phat-goc-nicolas-jover-la-vo-gia-voi-arsenal-1431091.ldo"
	# db_name = "test"
	# import asyncio
	# collection_name = "articles"
	# backup_db_name = "backup_test"
	# backup_collection_name = "articles_analysis"
	# processed_text = asyncio.run(process_url(url, db_name, collection_name, backup_db_name, backup_collection_name))