✨ Feat: Dataset and labeling code for fine-tuning #13

Open · wants to merge 6 commits into base: feature/#12
Binary file added .DS_Store
Binary file not shown.
2 changes: 2 additions & 0 deletions .gitattributes
@@ -0,0 +1,2 @@
*.safetensors filter=lfs diff=lfs merge=lfs -text
*.pt filter=lfs diff=lfs merge=lfs -text
Binary file added .github/.DS_Store
Binary file not shown.
4 changes: 4 additions & 0 deletions .gitignore
@@ -0,0 +1,4 @@
.parrotalk
secret.py
result/
results/
Binary file added __pycache__/.DS_Store
Binary file not shown.
Binary file added __pycache__/secret.cpython-312.pyc
Binary file not shown.
Empty file added aiModel/__init__.py
Empty file.
Binary file added aiModel/__pycache__/__init__.cpython-312.pyc
Binary file not shown.
Binary file added aiModel/__pycache__/llm.cpython-312.pyc
Binary file not shown.
4 changes: 4 additions & 0 deletions aiModel/embeddings.py
@@ -0,0 +1,4 @@
from langchain_openai import OpenAIEmbeddings
import secret

embedding_model = OpenAIEmbeddings(openai_api_key=secret.openai_api_key)
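
A minimal usage sketch (not part of this PR) for the shared embedding model; it assumes the repo root is on the import path so the module resolves as aiModel.embeddings, and that secret.openai_api_key is set:

from aiModel.embeddings import embedding_model

# embed_query returns one embedding vector (a list of floats) for a single string
vector = embedding_model.embed_query("내일 메뉴는 치킨과 피자 중 하나입니다.")
print(len(vector))  # dimensionality of the embedding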
9 changes: 9 additions & 0 deletions aiModel/llm.py
@@ -0,0 +1,9 @@
from langchain_openai import ChatOpenAI
import secret

AI_model = ChatOpenAI(
    model="gpt-4o-mini",
    temperature=0.75,
    max_tokens=1024,
    openai_api_key=secret.openai_api_key,
)
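
A matching hedged sketch for the chat model; invoke is LangChain's standard entry point and returns a message whose content field holds the reply:

from aiModel.llm import AI_model

# invoke accepts a plain string prompt and returns an AIMessage
response = AI_model.invoke("치킨과 피자 중 하나를 추천해 줘.")
print(response.content)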
Binary file added data/.DS_Store
Binary file not shown.
Binary file added data/Sample/.DS_Store
Binary file not shown.
Binary file added fineTuning/.DS_Store
Binary file not shown.
3 changes: 3 additions & 0 deletions fineTuning/GPU_check.py
@@ -0,0 +1,3 @@
import torch
print(torch.cuda.is_available())  # prints True if a GPU can be used
print(torch.cuda.get_device_name(0))  # prints the GPU name
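
Note that torch.cuda.get_device_name(0) raises an error on machines with no CUDA device, so the second print fails exactly when the first one prints False. A guarded variant (an addition, not part of the PR):

import torch

if torch.cuda.is_available():
    print(f"GPU available: {torch.cuda.get_device_name(0)}")
else:
    print("No CUDA device found; training would fall back to CPU.")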
25 changes: 25 additions & 0 deletions fineTuning/baseModel.py
@@ -0,0 +1,25 @@
# Test the base model
from transformers import AutoTokenizer, AutoModelForQuestionAnswering
import torch

# Load the model and tokenizer
model_name = "timpal0l/mdeberta-v3-base-squad2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForQuestionAnswering.from_pretrained(model_name)

# Set up the question and context (a Korean QA sample)
question = "둘 중에 무엇으로 할래요?"  # "Which of the two would you like?"
context = "남자는 여자에게 내일 메뉴에 대해 물었고 여자는 치킨이랑 피자 중에서 고르라고 했다."  # "The man asked the woman about tomorrow's menu, and she told him to choose between chicken and pizza."

# Tokenize the input pair
inputs = tokenizer(question, context, return_tensors="pt")

# Get span predictions from the model
outputs = model(**inputs)
answer_start_index = torch.argmax(outputs.start_logits)
answer_end_index = torch.argmax(outputs.end_logits) + 1

# Convert the predicted answer tokens back into a string
answer = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(inputs["input_ids"][0][answer_start_index:answer_end_index]))

print(f"Answer: {answer}")
Binary file added fineTuning/data/.DS_Store
Binary file not shown.
Binary file added fineTuning/data/1.Training/.DS_Store
Binary file not shown.
58 changes: 58 additions & 0 deletions fineTuning/data/1.Training/clean.py
@@ -0,0 +1,58 @@
import json

merged_file_path = "fineTuning/data/1.Training/merged_train_data.json"

# Function to clean the JSON data
def clean_json_data(file_path):
    try:
        with open(file_path, "r", encoding="utf-8") as file:
            data = json.load(file)
    except json.JSONDecodeError as e:
        print(f"JSONDecodeError: {e}")
        return None
    except Exception as e:
        print(f"Unexpected error: {e}")
        return None

    cleaned_data = []
    for i, entry in enumerate(data):
        if not entry:  # skip entries that are None
            print(f"Warning: Entry {i} is None. Skipping...")
            continue

        # Check required fields and fill in defaults
        if "context" not in entry or not entry["context"]:
            print(f"Warning: Entry {i} missing 'context'. Skipping...")
            continue  # drop entries without a context
        if "question" not in entry or not entry["question"]:
            print(f"Warning: Entry {i} missing 'question'. Skipping...")
            continue  # drop entries without a question
        if "answers" not in entry or not isinstance(entry["answers"], dict):
            print(f"Warning: Entry {i} missing or invalid 'answers'. Setting default values...")
            entry["answers"] = {"text": [""], "answer_start": [0]}  # fall back to an empty answer
        elif not entry["answers"]["text"]:  # answers["text"] is empty
            print(f"Warning: Entry {i} has empty 'answers'. Setting default values...")
            entry["answers"]["text"] = [""]
            entry["answers"]["answer_start"] = [0]

        # Keep the cleaned entry
        cleaned_data.append(entry)

    # Save the cleaned data to a new JSON file
    cleaned_file_path = "fineTuning/data/1.Training/cleaned_train_data.json"
    try:
        with open(cleaned_file_path, "w", encoding="utf-8") as cleaned_file:
            json.dump(cleaned_data, cleaned_file, ensure_ascii=False, indent=4)
        print(f"Cleaned data saved to: {cleaned_file_path}")
    except Exception as e:
        print(f"Error saving cleaned data: {e}")
        return None

    return cleaned_file_path

# Run the cleaning step
cleaned_file_path = clean_json_data(merged_file_path)
if cleaned_file_path:
    print(f"Cleaned file path: {cleaned_file_path}")
else:
    print("Failed to clean and save JSON data.")
408,638 changes: 408,638 additions & 0 deletions fineTuning/data/1.Training/cleaned_train_data.json

Large diffs are not rendered by default.

18 changes: 18 additions & 0 deletions fineTuning/data/1.Training/dataMerge.py
@@ -0,0 +1,18 @@
import os
import json

folder_path = "fineTuning/data/1.Training/labeled_data"

merged_data = []

# Collect every labeled JSON file in the folder into a single list
for file_name in os.listdir(folder_path):
    if file_name.endswith(".json"):
        file_path = os.path.join(folder_path, file_name)
        with open(file_path, "r", encoding="utf-8") as file:
            data = json.load(file)
        merged_data.extend(data)

output_file = "fineTuning/data/1.Training/merged_train_data.json"  # match the path clean.py reads
with open(output_file, "w", encoding="utf-8") as file:
    json.dump(merged_data, file, ensure_ascii=False, indent=4)
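
The json.load call above assumes every labeled file holds a top-level list; if a labeling run ever emitted a single object instead, extend would splice its keys in one by one. A hedged defensive variant of the loop (the isinstance guard is an addition, not part of the PR):

for file_name in os.listdir(folder_path):
    if not file_name.endswith(".json"):
        continue
    with open(os.path.join(folder_path, file_name), "r", encoding="utf-8") as f:
        payload = json.load(f)
    if isinstance(payload, list):
        merged_data.extend(payload)
    else:
        merged_data.append(payload)  # wrap a lone object instead of splicing its keys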
