myezrag

Sleeping

App Files Files Community

ginipick committed on Oct 26, 2024

Commit

b2e08df

verified ·

1 Parent(s): d2a7d2b

Update app.py

Browse files

Files changed (1) hide show

app.py +167 -284

app.py CHANGED Viewed

@@ -7,9 +7,49 @@ import json
 import io
 import traceback
 import csv
-# HuggingFace 클라이언트 대신 OpenAI 클라이언트 사용
 from openai import OpenAI
-import os
 # 추론 API 클라이언트 설정
 hf_client = InferenceClient(
@@ -34,88 +74,24 @@ def load_parquet(filename: str) -> str:
     except Exception as e:
         return f"파일을 읽는 중 오류가 발생했습니다: {str(e)}"
-# OpenAI 클라이언트 설정
-client = OpenAI(api_key=os.getenv("OPEN_AI"))
-# respond 함수 수정
-def respond(message: str, history: List[Dict[str, str]], system_message: str = "", max_tokens: int = 4000, temperature: float = 0.5, top_p: float = 0.9, parquet_data: str = None) -> str:
-    # 시스템 프롬프트 설정
-    system_prefix = """반드시 한글로 답변할 것. 너는 업로드된 데이터를 기반으로 질문에 답변하는 역할을 한다.
-주요 지침:
-1. 질문과 직접 관련된 내용만 간단명료하게 답변할 것
-2. 이전 답변과 중복되는 내용은 제외할 것
-3. 불필요한 예시나 부연 설명은 하지 말 것
-4. 동일한 내용을 다른 표현으로 반복하지 말 것
-5. 핵심 정보만 전달할 것
-"""
-    if parquet_data:
-        try:
-            df = pd.read_json(io.StringIO(parquet_data))
-            data_summary = df.describe(include='all').to_string()
-            system_prefix += f"\n\n데이터 요약:\n{data_summary}"
-        except Exception as e:
-            print(f"데이터 로드 오류: {str(e)}")
-    # 대화 히스토리 구성
-    messages = [{"role": "system", "content": system_prefix}]
-    # 최근 대화 컨텍스트만 유지
-    recent_history = history[-3:] if history else []
-    for chat in recent_history:
-        messages.append({"role": chat["role"], "content": chat["content"]})
-    messages.append({"role": "user", "content": message})
-    try:
-        # OpenAI API 호출
-        response = client.chat.completions.create(
-            model="gpt-4o-mini",  # GPT-4-mini 모델 사용
-            messages=messages,
-            max_tokens=max_tokens,
-            temperature=temperature,
-            top_p=top_p,
-            stream=True
-        )
-        full_response = ""
-        for chunk in response:
-            if chunk.choices[0].delta.content:
-                full_response += chunk.choices[0].delta.content
-                # 응답 정제
-                cleaned_response = clean_response(full_response)
-                yield cleaned_response
-    except Exception as e:
-        error_message = f"추론 오류: {str(e)}"
-        print(error_message)
-        yield error_message
 def clean_response(text: str) -> str:
     """응답 텍스트 정제 함수"""
-    # 문장 단위로 분리
     sentences = [s.strip() for s in text.split('.') if s.strip()]
-    # 중복 제거
     unique_sentences = []
     seen = set()
     for sentence in sentences:
-        # 문장 정규화 (공백 제거, 소문자 변환)
         normalized = ' '.join(sentence.lower().split())
         if normalized not in seen:
             seen.add(normalized)
             unique_sentences.append(sentence)
-    # 정제된 문장 결합
     cleaned_text = '. '.join(unique_sentences)
     if cleaned_text and not cleaned_text.endswith('.'):
         cleaned_text += '.'
     return cleaned_text
 def remove_duplicates(text: str) -> str:
     """중복 문장 제거 함수"""
     sentences = text.split('.')
@@ -132,20 +108,17 @@ def remove_duplicates(text: str) -> str:
 def upload_csv(file_path: str) -> Tuple[str, str]:
     try:
-        # CSV 파일 읽기
         df = pd.read_csv(file_path, sep=',')
-        # 필수 컬럼 확인
         required_columns = {'id', 'text', 'label', 'metadata'}
         available_columns = set(df.columns)
         missing_columns = required_columns - available_columns
         if missing_columns:
             return f"CSV 파일에 다음 필수 컬럼이 누락되었습니다: {', '.join(missing_columns)}", ""
-        # 데이터 클렌징
         df.drop_duplicates(inplace=True)
         df.fillna('', inplace=True)
-        # 데이터 유형 최적화
         df = df.astype({'id': 'int32', 'text': 'string', 'label': 'category', 'metadata': 'string'})
-        # Parquet 파일로 변환
         parquet_filename = os.path.splitext(os.path.basename(file_path))[0] + '.parquet'
         df.to_parquet(parquet_filename, engine='pyarrow', compression='snappy')
         return f"{parquet_filename} 파일이 성공적으로 업로드되고 변환되었습니다.", parquet_filename
@@ -154,10 +127,8 @@ def upload_csv(file_path: str) -> Tuple[str, str]:
 def upload_parquet(file_path: str) -> Tuple[str, str, str]:
     try:
-        # Parquet 파일 읽기
         df = pd.read_parquet(file_path, engine='pyarrow')
-        # 데이터 기본 정보 수집
         data_info = {
             "총 레코드 수": len(df),
             "컬럼 목록": list(df.columns),
@@ -165,143 +136,53 @@ def upload_parquet(file_path: str) -> Tuple[str, str, str]:
             "결측치 정보": df.isnull().sum().to_dict()
         }
-        # 데이터 요약 정보 생성
         summary = []
         summary.append(f"### 데이터셋 기본 정보:")
         summary.append(f"- 총 레코드 수: {data_info['총 레코드 수']}")
         summary.append(f"- 컬럼 목록: {', '.join(data_info['컬럼 목록'])}")
-        # 각 컬럼별 통계 정보 생성
         summary.append("\n### 컬럼별 정보:")
         for col in df.columns:
             if df[col].dtype in ['int64', 'float64']:
-                # 수치형 데이터
                 stats = df[col].describe()
                 summary.append(f"\n{col} (수치형):")
                 summary.append(f"- 평균: {stats['mean']:.2f}")
                 summary.append(f"- 최소: {stats['min']}")
                 summary.append(f"- 최대: {stats['max']}")
             elif df[col].dtype == 'object' or df[col].dtype == 'string':
-                # 문자열 데이터
                 unique_count = df[col].nunique()
                 summary.append(f"\n{col} (텍스트):")
                 summary.append(f"- 고유값 수: {unique_count}")
-                if unique_count < 10:  # 고유값이 적은 경우만 표시
                     value_counts = df[col].value_counts().head(5)
                     summary.append("- 상위 5개 값:")
                     for val, count in value_counts.items():
                         summary.append(f"  • {val}: {count}개")
-        # 미리보기 생성
         preview = df.head(10).to_markdown(index=False)
         summary.append("\n### 데이터 미리보기:")
         summary.append(preview)
         parquet_content = "\n".join(summary)
-        # DataFrame을 JSON 문자열로 변환 (Q&A에서 사용)
         parquet_json = df.to_json(orient='records', force_ascii=False)
         return "Parquet 파일이 성공적으로 업로드되었습니다.", parquet_content, parquet_json
     except Exception as e:
         return f"Parquet 파일 업로드 중 오류가 발생했습니다: {str(e)}", "", ""
-def respond(message: str, history: List[Dict[str, str]], system_message: str = "", max_tokens: int = 4000, temperature: float = 0.5, top_p: float = 0.9, parquet_data: str = None) -> str:
-    try:
-        if parquet_data:
-            # JSON 문자열을 DataFrame으로 변환
-            df = pd.read_json(io.StringIO(parquet_data))
-            # 데이터셋 컨텍스트 생성
-            columns_info = []
-            for col in df.columns:
-                if df[col].dtype in ['int64', 'float64']:
-                    col_type = "수치형"
-                    stats = df[col].describe()
-                    col_info = f"- {col} ({col_type}): 평균={stats['mean']:.2f}, 최소={stats['min']}, 최대={stats['max']}"
-                else:
-                    col_type = "텍스트"
-                    unique_count = df[col].nunique()
-                    col_info = f"- {col} ({col_type}): 고유값 {unique_count}개"
-                columns_info.append(col_info)
-            data_context = f"""
-현재 업로드된 데이터셋 정보:
-- 총 {len(df)} 개의 레코드
-- 컬럼 정보:
-{chr(10).join(columns_info)}
-샘플 데이터:
-{df.head(20).to_string()}
-"""
-            system_prompt = f"""당신은 업로드된 데이터셋을 분석하고 질문에 답변하는 AI 어시스턴트입니다.
-주요 지침:
-1. 반드시 한글로 답변할 것
-2. 데이터셋의 실제 내용을 기반으로 정확하게 답변할 것
-3. 데이터에 없는 내용은 추측하지 말 것
-4. 답변은 간단명료하게 할 것
-5. 데이터 프라이버시를 고려하여 답변할 것
-데이터셋 구조 설명:
-{chr(10).join(columns_info)}
-참고할 데이터 샘플:
-{data_context}
-"""
-        else:
-            system_prompt = system_message or "너는 AI 조언자 역할이다."
-        # OpenAI API 호출
-        messages = [{"role": "system", "content": system_prompt}]
-        # 최근 대화 기록 추가
-        recent_history = history[-3:] if history else []
-        for chat in recent_history:
-            messages.append({"role": chat["role"], "content": chat["content"]})
-        messages.append({"role": "user", "content": message})
-        response = client.chat.completions.create(
-            model="gpt-4-0125-preview",
-            messages=messages,
-            max_tokens=max_tokens,
-            temperature=temperature,
-            top_p=top_p,
-            stream=True
-        )
-        full_response = ""
-        for chunk in response:
-            if chunk.choices[0].delta.content:
-                full_response += chunk.choices[0].delta.content
-                yield clean_response(full_response)
-    except Exception as e:
-        error_message = f"응답 생성 중 오류 발생: {str(e)}"
-        print(f"{error_message}\n{traceback.format_exc()}")
-        yield error_message
 def text_to_parquet(text: str) -> Tuple[str, str, str]:
     try:
-        # 입력 텍스트를 줄 단위로 분리
         lines = [line.strip() for line in text.split('\n') if line.strip()]
-        # 데이터를 저장할 리스트
         data = []
         for line in lines:
             try:
-                # 정규식을 사용하여 CSV 형식 파싱
                 import re
                 pattern = r'(\d+),([^,]+),([^,]+),(.+)'
                 match = re.match(pattern, line)
                 if match:
                     id_val, text_val, label_val, metadata_val = match.groups()
-                    # 쌍따옴표 제거 및 정제
                     text_val = text_val.strip().strip('"')
                     label_val = label_val.strip().strip('"')
                     metadata_val = metadata_val.strip().strip('"')
@@ -319,10 +200,7 @@ def text_to_parquet(text: str) -> Tuple[str, str, str]:
         if not data:
             return "변환할 데이터가 없습니다.", "", ""
-        # DataFrame 생성
         df = pd.DataFrame(data)
-        # 데이터 타입 설정
         df = df.astype({
             'id': 'int32',
             'text': 'string',
@@ -330,11 +208,8 @@ def text_to_parquet(text: str) -> Tuple[str, str, str]:
             'metadata': 'string'
         })
-        # Parquet 파일로 변환
         parquet_filename = 'text_to_parquet.parquet'
         df.to_parquet(parquet_filename, engine='pyarrow', compression='snappy')
-        # 미리보기 생성
         preview = df.to_markdown(index=False)
         return (
@@ -348,34 +223,46 @@ def text_to_parquet(text: str) -> Tuple[str, str, str]:
         print(f"{error_message}\n{traceback.format_exc()}")
         return error_message, "", ""
-# preprocess_text_with_llm 함수도 수정
-def preprocess_text_with_llm(input_text: str) -> str:
-    if not input_text.strip():
-        return "입력 텍스트가 비어있습니다."
-    system_prompt = """반드시 한글(한국어)로 답변하시오. 당신은 데이터 전처리 전문가입니다. 입력된 텍스트를 CSV 데이터셋 형식으로 변환하세요.
-규칙:
-1. 출력 형식: id,text,label,metadata
-2. id: 1부터 시작하는 순차적 번호
-3. text: 의미 있는 단위로 분리된 텍스트
-4. label: 텍스트의 주제나 카테고리를 아래 기준으로 정확하게 한 개만 선택
-   - Historical_Figure (역사적 인물)
-   - Military_History (군사 역사)
-   - Technology (기술)
-   - Politics (정치)
-   - Culture (문화)
-5. metadata: 날짜, 출처 등 추가 정보"""
     try:
         response = client.chat.completions.create(
             model="gpt-4-0125-preview",
-            messages=[
-                {"role": "system", "content": system_prompt},
-                {"role": "user", "content": input_text}
-            ],
-            max_tokens=4000,
-            temperature=0.1,
             stream=True
         )
@@ -383,26 +270,19 @@ def preprocess_text_with_llm(input_text: str) -> str:
         for chunk in response:
             if chunk.choices[0].delta.content:
                 full_response += chunk.choices[0].delta.content
-        # 응답 정제
-        processed_text = clean_response(full_response)
-        # CSV 형식 검증
-        try:
-            from io import StringIO
-            import csv
-            csv.reader(StringIO(processed_text))
-            return processed_text
-        except csv.Error:
-            return "LLM이 올바른 CSV 형식을 생성하지 못했습니다. 다시 시도해주세요."
     except Exception as e:
-        error_message = f"전처리 중 오류가 발생했습니다: {str(e)}"
-        print(error_message)
-        return error_message# preprocess_text_with_llm 함수도 수정
-def preprocess_text_with_llm(input_text: str) -> str:
-    if not input_text.strip():
-        return "입력 텍스트가 비어있습니다."
     system_prompt = """반드시 한글(한국어)로 답변하시오. 당신은 데이터 전처리 전문가입니다. 입력된 텍스트를 CSV 데이터셋 형식으로 변환하세요.
@@ -420,7 +300,7 @@ def preprocess_text_with_llm(input_text: str) -> str:
     try:
         response = client.chat.completions.create(
-            model="gpt-4o-mini",
             messages=[
                 {"role": "system", "content": system_prompt},
                 {"role": "user", "content": input_text}
@@ -435,10 +315,8 @@ def preprocess_text_with_llm(input_text: str) -> str:
             if chunk.choices[0].delta.content:
                 full_response += chunk.choices[0].delta.content
-        # 응답 정제
         processed_text = clean_response(full_response)
-        # CSV 형식 검증
         try:
             from io import StringIO
             import csv
@@ -452,46 +330,50 @@ def preprocess_text_with_llm(input_text: str) -> str:
         print(error_message)
         return error_message
-# CSS 설정
-css = """
-footer {
-    visibility: hidden;
-}
-#chatbot-container, #chatbot-data-upload {
-    height: 700px;
-    overflow-y: scroll;
-}
-#chatbot-container .message, #chatbot-data-upload .message {
-    font-size: 14px;
-}
-/* 입력창 배경색 및 글자색 변경 */
-textarea, input[type="text"] {
-    background-color: #ffffff; /* 흰색 배경 */
-    color: #000000; /* 검정색 글자 */
-}
-/* 파일 업로드 영역 높이 조절 */
-#parquet-upload-area {
-    max-height: 150px;
-    overflow-y: auto;
-}
-/* 초기 설명 글씨 크기 조절 */
-#initial-description {
-    font-size: 14px;
-}
-"""
 # Gradio Blocks 인터페이스 설정
 with gr.Blocks(css=css) as demo:
     gr.Markdown("# MyEzRAG: LLM이 나만의 데이터로 학습한 콘텐츠 생성/답변", elem_id="initial-description")
     gr.Markdown(
         "### '사용 방법' 탭을 통해 자세한 이용 방법을 참고하세요.\n"
         "### Tip) '예제'를 통해 다양한 활용 방법을 체험하고 응용해 보세요, 데이터셋 업로드시 미리보기는 10건만 출력",
         elem_id="initial-description"
     )
-    # 첫 번째 탭: 챗봇 데이터 업로드 (탭 이름 변경: "My 데이터셋+LLM")
     with gr.Tab("My 데이터셋+LLM"):
         gr.Markdown("### LLM과 대화하기")
         chatbot_data_upload = gr.Chatbot(label="챗봇", type="messages", elem_id="chatbot-data-upload")
@@ -506,10 +388,14 @@ with gr.Blocks(css=css) as demo:
         parquet_data_state = gr.State()
-        def handle_message_data_upload(message: str, history: List[Dict[str, str]], system_message: str, max_tokens: int, temperature: float, top_p: float, parquet_data: str):
             history = history or []
-            # 중복 질문 검사
             recent_questions = [chat['content'].strip().lower() for chat in history[-3:] if chat['role'] == 'user']
             if message.strip().lower() in recent_questions:
                 yield history + [{"role": "assistant", "content": "동일한 질문이 최근에 있었습니다. 다른 질문을 해주세요."}], ""
@@ -522,9 +408,10 @@ with gr.Blocks(css=css) as demo:
                     history,
                     system_message,
                     max_tokens,
-                    temperature=0.3,  # 낮은 temperature 사용
                     top_p=top_p,
-                    parquet_data=parquet_data
                 )
                 partial_response = ""
@@ -539,9 +426,6 @@ with gr.Blocks(css=css) as demo:
                 history.append({"role": "assistant", "content": response})
                 yield history, ""
         send_data_upload.click(
             handle_message_data_upload,
             inputs=[
@@ -551,13 +435,14 @@ with gr.Blocks(css=css) as demo:
                 max_tokens,
                 temperature,
                 top_p,
-                parquet_data_state,  # parquet_data_state를 사용하여 업로드된 데이터를 전달
             ],
             outputs=[chatbot_data_upload, msg_data_upload],
             queue=True
         )
-        # 예제 추가
         with gr.Accordion("예제", open=False):
             gr.Examples(
                 examples=[
@@ -572,7 +457,7 @@ with gr.Blocks(css=css) as demo:
                 label="예제 선택",
             )
-        # Parquet 파일 업로드를 화면 하단으로 이동
         gr.Markdown("### Parquet 파일 업로드")
         with gr.Row():
             with gr.Column():
@@ -596,7 +481,7 @@ with gr.Blocks(css=css) as demo:
                     outputs=[parquet_upload_status, parquet_preview_chat, parquet_data_state]
                 )
-    # 두 번째 탭: 데이터 변환 (탭 이름 변경: "CSV to My 데이터셋")
     with gr.Tab("CSV to My 데이터셋"):
         gr.Markdown("### CSV 파일 업로드 및 Parquet 변환")
         with gr.Row():
@@ -621,7 +506,7 @@ with gr.Blocks(css=css) as demo:
                     outputs=[upload_status, parquet_preview, download_button]
                 )
-    # 세 번째 탭: 텍스트 to csv to parquet 변환 (탭 이름 변경: "Text to My 데이터셋")
     with gr.Tab("Text to My 데이터셋"):
         gr.Markdown("### 텍스트를 입력하면 CSV로 변환 후 Parquet으로 자동 전환됩니다.")
         with gr.Row():
@@ -649,7 +534,7 @@ with gr.Blocks(css=css) as demo:
                     outputs=[convert_status, parquet_preview_convert, download_parquet_convert]
                 )
-    # 네번째 탭의 UI 부분 수정
     with gr.Tab("Text Preprocessing with LLM"):
         gr.Markdown("### 텍스트를 입력하면 LLM이 데이터셋 형식에 맞게 전처리하여 출력합니다.")
         with gr.Row():
@@ -676,33 +561,29 @@ with gr.Blocks(css=css) as demo:
                     interactive=False
                 )
-                # Parquet 변환 및 다운로드 섹션
                 convert_to_parquet_button = gr.Button("Parquet으로 변환")
                 download_parquet = gr.File(label="변환된 Parquet 파일 다운로드")
-                def handle_text_preprocessing(input_text: str):
                     if not input_text.strip():
-                        return "입력 텍스트가 없습니다.", ""
                     try:
-                        preprocess_status_msg = "전처리를 시작합니다..."
-                        yield preprocess_status_msg, ""
-                        processed_text = preprocess_text_with_llm(input_text)
                         if processed_text:
-                            preprocess_status_msg = "전처리가 완료되었습니다."
-                            yield preprocess_status_msg, processed_text
                         else:
-                            preprocess_status_msg = "전처리 결과가 없습니다."
-                            yield preprocess_status_msg, ""
                     except Exception as e:
-                        error_msg = f"처리 중 오류가 발생했습니다: {str(e)}"
-                        yield error_msg, ""
                 def clear_inputs():
                     return "", "대기 중...", ""
@@ -719,10 +600,9 @@ with gr.Blocks(css=css) as demo:
                     except Exception as e:
                         return f"Parquet 변환 중 오류 발생: {str(e)}", None
-                # 이벤트 핸들러 연결
                 preprocess_button.click(
                     handle_text_preprocessing,
-                    inputs=[raw_text_input],
                     outputs=[preprocess_status, processed_text_output],
                     queue=True
                 )
@@ -738,7 +618,6 @@ with gr.Blocks(css=css) as demo:
                     outputs=[preprocess_status, download_parquet]
                 )
-                # 예제 텍스트 추가
                 with gr.Accordion("예제 텍스트", open=False):
                     gr.Examples(
                         examples=[
@@ -749,12 +628,17 @@ with gr.Blocks(css=css) as demo:
                         label="예제 선택"
                     )
     with gr.Tab("📚 사용 방법"):
         gr.Markdown("""
         # MyEzRAG 사용 가이드
         ## 1️⃣ My 데이터셋+LLM 탭
-        ![Tab1](https://your-image-url.com/tab1.png)
         ### 기능
         - 업로드된 Parquet 데이터셋을 기반으로 LLM과 대화
         - 데이터셋의 내용을 활용한 콘텐츠 생성
@@ -771,7 +655,6 @@ with gr.Blocks(css=css) as demo:
         ---
         ## 2️⃣ CSV to My 데이터셋 탭
-        ![Tab2](https://your-image-url.com/tab2.png)
         ### 기능
         - CSV 파일을 Parquet 형식으로 변환
         - 데이터 최적화 및 정제
@@ -788,7 +671,6 @@ with gr.Blocks(css=css) as demo:
         ---
         ## 3️⃣ Text to My 데이터셋 탭
-        ![Tab3](https://your-image-url.com/tab3.png)
         ### 기능
         - 텍스트 형식의 데이터를 Parquet으로 변환
         - 수동 데이터 입력 지원
@@ -811,7 +693,6 @@ with gr.Blocks(css=css) as demo:
         ---
         ## 4️⃣ Text Preprocessing with LLM 탭
-        ![Tab4](https://your-image-url.com/tab4.png)
         ### 기능
         - LLM을 활용한 자동 텍스트 전처리
         - 구조화된 데이터셋 생성
@@ -828,26 +709,28 @@ with gr.Blocks(css=css) as demo:
         - 데이터 정규화
         ## 💡 일반적인 팁
         - 각 탭의 예제를 참고하여 사용법 익히기
         - 데이터 품질이 좋을수록 더 나은 결과 제공
         - 오류 발생 시 입력 데이터 형식 확인
         - 대용량 처리 시 적절한 청크 크기로 분할 처리
         ## ⚠️ 주의사항
         - 민감한 개인정보 포함하지 않기
         - 데이터 백업 권장
         - 네트워크 상태 확인
         - 브라우저 캐시 주기적 정리
         ## 🔍 문제 해결
         - 오류 발생 시 입력 데이터 형식 확인
         - 파일 업로드 실패 시 파일 크기 및 형식 확인
         - 변환 실패 시 데이터 인코딩 확인
         - 응답이 느릴 경우 데이터 크기 조정
         """)
     gr.Markdown("### [email protected]", elem_id="initial-description")
 if __name__ == "__main__":
-    demo.launch(share=True)

 import io
 import traceback
 import csv
 from openai import OpenAI
+from functools import lru_cache
+from concurrent.futures import ThreadPoolExecutor
+import math
+# CSS 설정
+css = """
+footer {
+    visibility: hidden;
+}
+#chatbot-container, #chatbot-data-upload {
+    height: 700px;
+    overflow-y: scroll;
+}
+#chatbot-container .message, #chatbot-data-upload .message {
+    font-size: 14px;
+}
+/* 입력창 배경색 및 글자색 변경 */
+textarea, input[type="text"] {
+    background-color: #ffffff;
+    color: #000000;
+}
+/* 파일 업로드 영역 높이 조절 */
+#parquet-upload-area {
+    max-height: 150px;
+    overflow-y: auto;
+}
+/* 초기 설명 글씨 크기 조절 */
+#initial-description {
+    font-size: 14px;
+}
+/* API Key 입력 섹션 스타일 */
+.api-key-section {
+    margin: 10px 0;
+    padding: 10px;
+    border: 1px solid #ddd;
+    border-radius: 5px;
+}
+.api-key-status {
+    margin-top: 5px;
+    font-weight: bold;
+}
+"""
 # 추론 API 클라이언트 설정
 hf_client = InferenceClient(
     except Exception as e:
         return f"파일을 읽는 중 오류가 발생했습니다: {str(e)}"
 def clean_response(text: str) -> str:
     """Remove repeated sentences from a response text.

     The text is cut into sentences at every period that is followed by
     whitespace or by the end of the text.  Periods inside a token (decimals
     such as ``3.14``, version numbers, dotted dates) are therefore kept
     intact instead of being treated as sentence boundaries.

     Sentences are compared case-insensitively with runs of whitespace
     collapsed; the first occurrence of each sentence is kept with its
     original spelling and later duplicates are dropped.

     Args:
         text: Raw response text; may be empty or None.

     Returns:
         The surviving sentences joined by ". " and terminated with a single
         ".", or an empty string when no sentence remains.
     """
     # Function-scope import, the same way text_to_parquet imports re locally.
     import re
     if not text:
         return ""
     # Only a period followed by whitespace or end-of-text ends a sentence;
     # a bare split('.') would turn "3.14" into "3. 14.".
     pieces = re.split(r'\.(?=\s|$)', text)
     seen = set()
     unique_sentences = []
     for piece in pieces:
         sentence = piece.strip()
         if not sentence:
             continue
         # Comparison key: lower-cased with whitespace runs collapsed.
         normalized = ' '.join(sentence.lower().split())
         if normalized in seen:
             continue
         seen.add(normalized)
         unique_sentences.append(sentence)
     cleaned_text = '. '.join(unique_sentences)
     # The split consumed each sentence's terminating period, so put the
     # final one back whenever there is any text left.
     return cleaned_text + '.' if cleaned_text else cleaned_text
 def remove_duplicates(text: str) -> str:
     """중복 문장 제거 함수"""
     sentences = text.split('.')
 def upload_csv(file_path: str) -> Tuple[str, str]:
     try:
         df = pd.read_csv(file_path, sep=',')
         required_columns = {'id', 'text', 'label', 'metadata'}
         available_columns = set(df.columns)
         missing_columns = required_columns - available_columns
         if missing_columns:
             return f"CSV 파일에 다음 필수 컬럼이 누락되었습니다: {', '.join(missing_columns)}", ""
         df.drop_duplicates(inplace=True)
         df.fillna('', inplace=True)
         df = df.astype({'id': 'int32', 'text': 'string', 'label': 'category', 'metadata': 'string'})
         parquet_filename = os.path.splitext(os.path.basename(file_path))[0] + '.parquet'
         df.to_parquet(parquet_filename, engine='pyarrow', compression='snappy')
         return f"{parquet_filename} 파일이 성공적으로 업로드되고 변환되었습니다.", parquet_filename
 def upload_parquet(file_path: str) -> Tuple[str, str, str]:
     try:
         df = pd.read_parquet(file_path, engine='pyarrow')
         data_info = {
             "총 레코드 수": len(df),
             "컬럼 목록": list(df.columns),
             "결측치 정보": df.isnull().sum().to_dict()
         }
         summary = []
         summary.append(f"### 데이터셋 기본 정보:")
         summary.append(f"- 총 레코드 수: {data_info['총 레코드 수']}")
         summary.append(f"- 컬럼 목록: {', '.join(data_info['컬럼 목록'])}")
         summary.append("\n### 컬럼별 정보:")
         for col in df.columns:
             if df[col].dtype in ['int64', 'float64']:
                 stats = df[col].describe()
                 summary.append(f"\n{col} (수치형):")
                 summary.append(f"- 평균: {stats['mean']:.2f}")
                 summary.append(f"- 최소: {stats['min']}")
                 summary.append(f"- 최대: {stats['max']}")
             elif df[col].dtype == 'object' or df[col].dtype == 'string':
                 unique_count = df[col].nunique()
                 summary.append(f"\n{col} (텍스트):")
                 summary.append(f"- 고유값 수: {unique_count}")
+                if unique_count < 10:
                     value_counts = df[col].value_counts().head(5)
                     summary.append("- 상위 5개 값:")
                     for val, count in value_counts.items():
                         summary.append(f"  • {val}: {count}개")
         preview = df.head(10).to_markdown(index=False)
         summary.append("\n### 데이터 미리보기:")
         summary.append(preview)
         parquet_content = "\n".join(summary)
         parquet_json = df.to_json(orient='records', force_ascii=False)
         return "Parquet 파일이 성공적으로 업로드되었습니다.", parquet_content, parquet_json
     except Exception as e:
         return f"Parquet 파일 업로드 중 오류가 발생했습니다: {str(e)}", "", ""
 def text_to_parquet(text: str) -> Tuple[str, str, str]:
     try:
         lines = [line.strip() for line in text.split('\n') if line.strip()]
         data = []
         for line in lines:
             try:
                 import re
                 pattern = r'(\d+),([^,]+),([^,]+),(.+)'
                 match = re.match(pattern, line)
                 if match:
                     id_val, text_val, label_val, metadata_val = match.groups()
                     text_val = text_val.strip().strip('"')
                     label_val = label_val.strip().strip('"')
                     metadata_val = metadata_val.strip().strip('"')
         if not data:
             return "변환할 데이터가 없습니다.", "", ""
         df = pd.DataFrame(data)
         df = df.astype({
             'id': 'int32',
             'text': 'string',
             'metadata': 'string'
         })
         parquet_filename = 'text_to_parquet.parquet'
         df.to_parquet(parquet_filename, engine='pyarrow', compression='snappy')
         preview = df.to_markdown(index=False)
         return (
         print(f"{error_message}\n{traceback.format_exc()}")
         return error_message, "", ""
+def respond(message: str, history: List[Dict[str, str]], system_message: str = "", max_tokens: int = 4000, temperature: float = 0.5, top_p: float = 0.9, parquet_data: str = None, api_key: str = None) -> str:
+    if not api_key:
+        yield "⚠️ API Key가 설정되지 않았습니다. 서비스 이용을 위해 API Key를 입력해주세요."
+        return
+    # OpenAI 클라이언트 초기화
+    client = OpenAI(api_key=api_key)
+    system_prefix = """반드시 한글로 답변할 것. 너는 업로드된 데이터를 기반으로 질문에 답변하는 역할을 한다.
+주요 지침:
+1. 질문과 직접 관련된 내용만 간단명료하게 답변할 것
+2. 이전 답변과 중복되는 내용은 제외할 것
+3. 불필요한 예시나 부연 설명은 하지 말 것
+4. 동일한 내용을 다른 표현으로 반복하지 말 것
+5. 핵심 정보만 전달할 것
+"""
+    if parquet_data:
+        try:
+            df = pd.read_json(io.StringIO(parquet_data))
+            data_summary = df.describe(include='all').to_string()
+            system_prefix += f"\n\n데이터 요약:\n{data_summary}"
+        except Exception as e:
+            print(f"데이터 로드 오류: {str(e)}")
+    messages = [{"role": "system", "content": system_prefix}]
+    recent_history = history[-3:] if history else []
+    for chat in recent_history:
+        messages.append({"role": chat["role"], "content": chat["content"]})
+    messages.append({"role": "user", "content": message})
     try:
         response = client.chat.completions.create(
             model="gpt-4-0125-preview",
+            messages=messages,
+            max_tokens=max_tokens,
+            temperature=temperature,
+            top_p=top_p,
             stream=True
         )
         for chunk in response:
             if chunk.choices[0].delta.content:
                 full_response += chunk.choices[0].delta.content
+                yield clean_response(full_response)
     except Exception as e:
+        error_message = f"응답 생성 중 오류 발생: {str(e)}"
+        print(f"{error_message}\n{traceback.format_exc()}")
+        yield error_message
+def preprocess_text_with_llm(input_text: str, api_key: str = None) -> str:
+    if not api_key:
+        return "⚠️ API Key가 설정되지 않았습니다. 서비스 이용을 위해 API Key를 입력해주세요."
+    # OpenAI 클라이언트 초기화
+    client = OpenAI(api_key=api_key)
     system_prompt = """반드시 한글(한국어)로 답변하시오. 당신은 데이터 전처리 전문가입니다. 입력된 텍스트를 CSV 데이터셋 형식으로 변환하세요.
     try:
         response = client.chat.completions.create(
+            model="gpt-4-0125-preview",
             messages=[
                 {"role": "system", "content": system_prompt},
                 {"role": "user", "content": input_text}
             if chunk.choices[0].delta.content:
                 full_response += chunk.choices[0].delta.content
         processed_text = clean_response(full_response)
         try:
             from io import StringIO
             import csv
         print(error_message)
         return error_message
 # Gradio Blocks 인터페이스 설정
 with gr.Blocks(css=css) as demo:
+    api_key_state = gr.State("")  # API 키를 저장할 State 추가
     gr.Markdown("# MyEzRAG: LLM이 나만의 데이터로 학습한 콘텐츠 생성/답변", elem_id="initial-description")
+    # API 키 입력 섹션 추가
+    with gr.Row(elem_classes="api-key-section"):
+        with gr.Column(scale=3):
+            api_key_input = gr.Textbox(
+                label="OpenAI API Key",
+                placeholder="sk-...",
+                type="password",
+                show_label=True
+            )
+        with gr.Column(scale=1):
+            api_key_button = gr.Button("API Key 설정", variant="primary")
+    # API 키 상태 표시
+    api_key_status = gr.Markdown("⚠️ API Key가 설정되지 않았습니다. 서비스 이용을 위해 API Key를 입력해주세요.", elem_classes="api-key-status")
+    # API 키 설정 함수
+    def set_api_key(api_key: str):
+        if not api_key.strip():
+            return "⚠️ API Key가 설정되지 않았습니다. 서비스 이용을 위해 API Key를 입력해주세요.", ""
+        if not api_key.startswith("sk-"):
+            return "❌ 올바르지 않은 API Key 형식입니다. 다시 확인해주세요.", ""
+        return "✅ API Key가 성공적으로 설정되었습니다.", api_key
+    # API 키 설정 이벤트 연결
+    api_key_button.click(
+        set_api_key,
+        inputs=[api_key_input],
+        outputs=[api_key_status, api_key_state]
+    )
     gr.Markdown(
         "### '사용 방법' 탭을 통해 자세한 이용 방법을 참고하세요.\n"
         "### Tip) '예제'를 통해 다양한 활용 방법을 체험하고 응용해 보세요, 데이터셋 업로드시 미리보기는 10건만 출력",
         elem_id="initial-description"
     )
+    # 첫 번째 탭: My 데이터셋+LLM
     with gr.Tab("My 데이터셋+LLM"):
         gr.Markdown("### LLM과 대화하기")
         chatbot_data_upload = gr.Chatbot(label="챗봇", type="messages", elem_id="chatbot-data-upload")
         parquet_data_state = gr.State()
+        def handle_message_data_upload(message: str, history: List[Dict[str, str]], system_message: str, max_tokens: int, temperature: float, top_p: float, parquet_data: str, api_key: str):
+            if not api_key:
+                history = history or []
+                history.append({"role": "assistant", "content": "⚠️ API Key가 설정되지 않았습니다. 서비스 이용을 위해 API Key를 입력해주세요."})
+                yield history, ""
+                return
             history = history or []
             recent_questions = [chat['content'].strip().lower() for chat in history[-3:] if chat['role'] == 'user']
             if message.strip().lower() in recent_questions:
                 yield history + [{"role": "assistant", "content": "동일한 질문이 최근에 있었습니다. 다른 질문을 해주세요."}], ""
                     history,
                     system_message,
                     max_tokens,
+                    temperature=0.3,
                     top_p=top_p,
+                    parquet_data=parquet_data,
+                    api_key=api_key
                 )
                 partial_response = ""
                 history.append({"role": "assistant", "content": response})
                 yield history, ""
         send_data_upload.click(
             handle_message_data_upload,
             inputs=[
                 max_tokens,
                 temperature,
                 top_p,
+                parquet_data_state,
+                api_key_state,
             ],
             outputs=[chatbot_data_upload, msg_data_upload],
             queue=True
         )
+# 예제 추가
         with gr.Accordion("예제", open=False):
             gr.Examples(
                 examples=[
                 label="예제 선택",
             )
+        # Parquet 파일 업로드
         gr.Markdown("### Parquet 파일 업로드")
         with gr.Row():
             with gr.Column():
                     outputs=[parquet_upload_status, parquet_preview_chat, parquet_data_state]
                 )
+    # 두 번째 탭: CSV to My 데이터셋
     with gr.Tab("CSV to My 데이터셋"):
         gr.Markdown("### CSV 파일 업로드 및 Parquet 변환")
         with gr.Row():
                     outputs=[upload_status, parquet_preview, download_button]
                 )
+    # 세 번째 탭: Text to My 데이터셋
     with gr.Tab("Text to My 데이터셋"):
         gr.Markdown("### 텍스트를 입력하면 CSV로 변환 후 Parquet으로 자동 전환됩니다.")
         with gr.Row():
                     outputs=[convert_status, parquet_preview_convert, download_parquet_convert]
                 )
+    # 네 번째 탭: Text Preprocessing with LLM
     with gr.Tab("Text Preprocessing with LLM"):
         gr.Markdown("### 텍스트를 입력하면 LLM이 데이터셋 형식에 맞게 전처리하여 출력합니다.")
         with gr.Row():
                     interactive=False
                 )
                 convert_to_parquet_button = gr.Button("Parquet으로 변환")
                 download_parquet = gr.File(label="변환된 Parquet 파일 다운로드")
+                def handle_text_preprocessing(input_text: str, api_key: str):
+                    if not api_key:
+                        yield "⚠️ API Key가 설정되지 않았습니다.", ""
+                        return
                     if not input_text.strip():
+                        yield "입력 텍스트가 없습니다.", ""
+                        return
                     try:
+                        yield "전처리를 시작합니다...", ""
+                        processed_text = preprocess_text_with_llm(input_text, api_key)
                         if processed_text:
+                            yield "전처리가 완료되었습니다.", processed_text
                         else:
+                            yield "전처리 결과가 없습니다.", ""
                     except Exception as e:
+                        yield f"처리 중 오류가 발생했습니다: {str(e)}", ""
                 def clear_inputs():
                     return "", "대기 중...", ""
                     except Exception as e:
                         return f"Parquet 변환 중 오류 발생: {str(e)}", None
                 preprocess_button.click(
                     handle_text_preprocessing,
+                    inputs=[raw_text_input, api_key_state],
                     outputs=[preprocess_status, processed_text_output],
                     queue=True
                 )
                     outputs=[preprocess_status, download_parquet]
                 )
                 with gr.Accordion("예제 텍스트", open=False):
                     gr.Examples(
                         examples=[
                         label="예제 선택"
                     )
+# 사용 방법 탭
     with gr.Tab("📚 사용 방법"):
         gr.Markdown("""
         # MyEzRAG 사용 가이드
+        ## 🔑 API Key 설정
+        1. OpenAI API Key를 상단 입력창에 입력
+        2. 'API Key 설정' 버튼 클릭
+        3. 설정 성공 메시지 확인
         ## 1️⃣ My 데이터셋+LLM 탭
         ### 기능
         - 업로드된 Parquet 데이터셋을 기반으로 LLM과 대화
         - 데이터셋의 내용을 활용한 콘텐츠 생성
         ---
         ## 2️⃣ CSV to My 데이터셋 탭
         ### 기능
         - CSV 파일을 Parquet 형식으로 변환
         - 데이터 최적화 및 정제
         ---
         ## 3️⃣ Text to My 데이터셋 탭
         ### 기능
         - 텍스트 형식의 데이터를 Parquet으로 변환
         - 수동 데이터 입력 지원
         ---
         ## 4️⃣ Text Preprocessing with LLM 탭
         ### 기능
         - LLM을 활용한 자동 텍스트 전처리
         - 구조화된 데이터셋 생성
         - 데이터 정규화
         ## 💡 일반적인 팁
+        - API Key는 안전하게 보관하고 주기적으로 갱신
         - 각 탭의 예제를 참고하여 사용법 익히기
         - 데이터 품질이 좋을수록 더 나은 결과 제공
         - 오류 발생 시 입력 데이터 형식 확인
         - 대용량 처리 시 적절한 청크 크기로 분할 처리
         ## ⚠️ 주의사항
+        - API Key를 타인과 공유하지 않기
         - 민감한 개인정보 포함하지 않기
         - 데이터 백업 권장
         - 네트워크 상태 확인
         - 브라우저 캐시 주기적 정리
         ## 🔍 문제 해결
+        - API Key 오류: 키 형식 및 유효성 확인
         - 오류 발생 시 입력 데이터 형식 확인
         - 파일 업로드 실패 시 파일 크기 및 형식 확인
         - 변환 실패 시 데이터 인코딩 확인
         - 응답이 느릴 경우 데이터 크기 조정
         """)
     gr.Markdown("### [email protected]", elem_id="initial-description")
 if __name__ == "__main__":
+    demo.launch(share=True)