✨ Feat: Dataset and labeling code for fine-tuning #13

Open · wants to merge 6 commits into base: feature/#12
Binary file added .DS_Store
Binary file not shown.
2 changes: 2 additions & 0 deletions .gitattributes
@@ -0,0 +1,2 @@
*.safetensors filter=lfs diff=lfs merge=lfs -text
*.pt filter=lfs diff=lfs merge=lfs -text
Binary file added .github/.DS_Store
Binary file not shown.
4 changes: 4 additions & 0 deletions .gitignore
@@ -0,0 +1,4 @@
.parrotalk
secret.py
result/
results/
Binary file added __pycache__/.DS_Store
Binary file not shown.
Binary file added __pycache__/secret.cpython-312.pyc
Binary file not shown.
Empty file added aiModel/__init__.py
Empty file.
Binary file added aiModel/__pycache__/__init__.cpython-312.pyc
Binary file not shown.
Binary file added aiModel/__pycache__/llm.cpython-312.pyc
Binary file not shown.
4 changes: 4 additions & 0 deletions aiModel/embeddings.py
@@ -0,0 +1,4 @@
from langchain_openai import OpenAIEmbeddings
import secret

embedding_model = OpenAIEmbeddings(openai_api_key=secret.openai_api_key)
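
A minimal usage sketch (not part of this PR) for the shared embedding model; it assumes the repo root is on the import path so the module resolves as aiModel.embeddings, and that secret.openai_api_key is set:

from aiModel.embeddings import embedding_model

# embed_query returns one embedding vector (a list of floats) for a single string
vector = embedding_model.embed_query("내일 메뉴는 치킨과 피자 중 하나입니다.")
print(len(vector))  # dimensionality of the embedding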
9 changes: 9 additions & 0 deletions aiModel/llm.py
@@ -0,0 +1,9 @@
from langchain_openai import ChatOpenAI
import secret

AI_model = ChatOpenAI(
    model="gpt-4o-mini",
    temperature=0.75,
    max_tokens=1024,
    openai_api_key=secret.openai_api_key,
)
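
A matching hedged sketch for the chat model; invoke is LangChain's standard entry point and returns a message whose content field holds the reply:

from aiModel.llm import AI_model

# invoke accepts a plain string prompt and returns an AIMessage
response = AI_model.invoke("치킨과 피자 중 하나를 추천해 줘.")
print(response.content)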
Binary file added data/.DS_Store
Binary file not shown.
Binary file added data/Sample/.DS_Store
Binary file not shown.
Binary file added fineTuning/.DS_Store
Binary file not shown.
3 changes: 3 additions & 0 deletions fineTuning/GPU_check.py
@@ -0,0 +1,3 @@
import torch
print(torch.cuda.is_available())  # prints True if a GPU can be used
print(torch.cuda.get_device_name(0))  # prints the GPU name
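
Note that torch.cuda.get_device_name(0) raises an error on machines with no CUDA device, so the second print fails exactly when the first one prints False. A guarded variant (an addition, not part of the PR):

import torch

if torch.cuda.is_available():
    print(f"GPU available: {torch.cuda.get_device_name(0)}")
else:
    print("No CUDA device found; training would fall back to CPU.")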
25 changes: 25 additions & 0 deletions fineTuning/baseModel.py
@@ -0,0 +1,25 @@
# Test the base model
from transformers import AutoTokenizer, AutoModelForQuestionAnswering
import torch

# Load the model and tokenizer
model_name = "timpal0l/mdeberta-v3-base-squad2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForQuestionAnswering.from_pretrained(model_name)

# Set up the question and context (a Korean QA sample)
question = "둘 중에 무엇으로 할래요?"  # "Which of the two would you like?"
context = "남자는 여자에게 내일 메뉴에 대해 물었고 여자는 치킨이랑 피자 중에서 고르라고 했다."  # "The man asked the woman about tomorrow's menu, and she told him to choose between chicken and pizza."

# Tokenize the input pair
inputs = tokenizer(question, context, return_tensors="pt")

# Get span predictions from the model
outputs = model(**inputs)
answer_start_index = torch.argmax(outputs.start_logits)
answer_end_index = torch.argmax(outputs.end_logits) + 1

# Convert the predicted answer tokens back into a string
answer = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(inputs["input_ids"][0][answer_start_index:answer_end_index]))

print(f"Answer: {answer}")
Binary file added fineTuning/data/.DS_Store
Binary file not shown.
Binary file added fineTuning/data/1.Training/.DS_Store
Binary file not shown.
58 changes: 58 additions & 0 deletions fineTuning/data/1.Training/clean.py
@@ -0,0 +1,58 @@
import json

merged_file_path = "fineTuning/data/1.Training/merged_train_data.json"

# Function to clean the JSON data
def clean_json_data(file_path):
    try:
        with open(file_path, "r", encoding="utf-8") as file:
            data = json.load(file)
    except json.JSONDecodeError as e:
        print(f"JSONDecodeError: {e}")
        return None
    except Exception as e:
        print(f"Unexpected error: {e}")
        return None

    cleaned_data = []
    for i, entry in enumerate(data):
        if not entry:  # skip entries that are None
            print(f"Warning: Entry {i} is None. Skipping...")
            continue

        # Check required fields and fill in defaults
        if "context" not in entry or not entry["context"]:
            print(f"Warning: Entry {i} missing 'context'. Skipping...")
            continue  # drop entries without a context
        if "question" not in entry or not entry["question"]:
            print(f"Warning: Entry {i} missing 'question'. Skipping...")
            continue  # drop entries without a question
        if "answers" not in entry or not isinstance(entry["answers"], dict):
            print(f"Warning: Entry {i} missing or invalid 'answers'. Setting default values...")
            entry["answers"] = {"text": [""], "answer_start": [0]}  # fall back to an empty answer
        elif not entry["answers"]["text"]:  # answers["text"] is empty
            print(f"Warning: Entry {i} has empty 'answers'. Setting default values...")
            entry["answers"]["text"] = [""]
            entry["answers"]["answer_start"] = [0]

        # Keep the cleaned entry
        cleaned_data.append(entry)

    # Save the cleaned data to a new JSON file
    cleaned_file_path = "fineTuning/data/1.Training/cleaned_train_data.json"
    try:
        with open(cleaned_file_path, "w", encoding="utf-8") as cleaned_file:
            json.dump(cleaned_data, cleaned_file, ensure_ascii=False, indent=4)
        print(f"Cleaned data saved to: {cleaned_file_path}")
    except Exception as e:
        print(f"Error saving cleaned data: {e}")
        return None

    return cleaned_file_path

# Run the cleaning step
cleaned_file_path = clean_json_data(merged_file_path)
if cleaned_file_path:
    print(f"Cleaned file path: {cleaned_file_path}")
else:
    print("Failed to clean and save JSON data.")
408,638 changes: 408,638 additions & 0 deletions fineTuning/data/1.Training/cleaned_train_data.json

Large diffs are not rendered by default.

18 changes: 18 additions & 0 deletions fineTuning/data/1.Training/dataMerge.py
@@ -0,0 +1,18 @@
import os
import json

folder_path = "fineTuning/data/1.Training/labeled_data"

merged_data = []

# Collect every labeled JSON file in the folder into a single list
for file_name in os.listdir(folder_path):
    if file_name.endswith(".json"):
        file_path = os.path.join(folder_path, file_name)
        with open(file_path, "r", encoding="utf-8") as file:
            data = json.load(file)
        merged_data.extend(data)

output_file = "fineTuning/data/1.Training/merged_train_data.json"  # match the path clean.py reads
with open(output_file, "w", encoding="utf-8") as file:
    json.dump(merged_data, file, ensure_ascii=False, indent=4)
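
The json.load call above assumes every labeled file holds a top-level list; if a labeling run ever emitted a single object instead, extend would splice its keys in one by one. A hedged defensive variant of the loop (the isinstance guard is an addition, not part of the PR):

for file_name in os.listdir(folder_path):
    if not file_name.endswith(".json"):
        continue
    with open(os.path.join(folder_path, file_name), "r", encoding="utf-8") as f:
        payload = json.load(f)
    if isinstance(payload, list):
        merged_data.extend(payload)
    else:
        merged_data.append(payload)  # wrap a lone object instead of splicing its keys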
