-
Notifications
You must be signed in to change notification settings - Fork 0
/
txt_to_csv.py
62 lines (49 loc) · 1.91 KB
/
txt_to_csv.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
import os
import pandas as pd
# Dataset path: root folder that holds one sub-folder per category.
root_dir = 'text'

# File names inside a category folder that are not articles and must be skipped.
# NOTE(review): the corpus referenced at the bottom of this file is believed to
# ship a LICENSE.txt in every category folder - confirm against the archive.
NON_ARTICLE_FILES = ('LICENSE.txt',)


def _parse_article(lines):
    """Split the raw lines of one article file into ``(title, content)``.

    The title is the third line; the content is every line from the fourth
    onwards, joined together. Both are stripped of surrounding whitespace.

    Args:
        lines: The lines of the file as returned by ``readlines()``.

    Returns:
        A ``(title, content)`` tuple, or ``None`` when the file has fewer
        than four lines and therefore has no body to extract.
    """
    if len(lines) < 4:
        return None
    title = lines[2].strip()
    content = ''.join(lines[3:]).strip()
    return title, content


def load_articles(data_dir, skip_files=NON_ARTICLE_FILES):
    """Collect every article under ``data_dir`` into a DataFrame.

    Each sub-directory of ``data_dir`` is treated as a category and its name
    becomes the label of the articles it contains.

    Args:
        data_dir: Root folder containing one sub-folder per category.
        skip_files: File names that are never parsed as articles.

    Returns:
        A ``pandas.DataFrame`` with the columns ``label``, ``title`` and
        ``content`` (in that order), one row per article.

    Raises:
        FileNotFoundError: If ``data_dir`` does not exist.
        UnicodeDecodeError: If an article file is not valid UTF-8.
    """
    rows = []
    # Sort directory listings so the row order is reproducible across runs;
    # os.listdir() returns entries in arbitrary order.
    for category in sorted(os.listdir(data_dir)):
        category_path = os.path.join(data_dir, category)
        # Only directories are categories; stray files at the root are ignored.
        if not os.path.isdir(category_path):
            continue
        for file_name in sorted(os.listdir(category_path)):
            file_path = os.path.join(category_path, file_name)
            # Skip known non-article files and anything that is not a regular
            # file (open() would fail on a nested directory).
            if file_name in skip_files or not os.path.isfile(file_path):
                continue
            with open(file_path, 'r', encoding='utf-8') as file:
                parsed = _parse_article(file.readlines())
            # Files that are too short to hold a title and a body are dropped.
            if parsed is None:
                continue
            title, content = parsed
            rows.append((category, title, content))
    # Passing the column names explicitly keeps the schema even with no rows.
    return pd.DataFrame(rows, columns=['label', 'title', 'content'])


def main():
    """Convert the dataset under ``root_dir`` to ``processed_data.csv``."""
    df = load_articles(root_dir)
    # Display the first few rows of data
    print(df.head())
    # Save as CSV file
    df.to_csv('processed_data.csv', index=False, encoding='utf-8')


if __name__ == '__main__':
    main()
# The dataset is available from: https://www.rondhuit.com/download.html
# 記事ファイルは以下のフォーマットにしたがって作成されています:
# The article files follow this format:
# 1行目:記事のURL
# Line 1: URL of the article
# 2行目:記事の日付
# Line 2: date of the article
# 3行目:記事のタイトル
# Line 3: title of the article
# 4行目以降:記事の本文
# Line 4 onwards: body of the article