# NOTE: removed GitHub page-scrape residue (site navigation text and the
# line-number gutter 1..125) — it was not part of the script.
# -*- coding: utf-8 -*-
"""proyek nlp dicoding_1.ipynb
Automatically generated by Colaboratory.
Original file is located at
https://colab.research.google.com/drive/1eWcv_8YbRdZ6k5jW-v1ATioHpMXFQamv
Nama : Dini Ramadhani Salsabila
Umur = 21 tahun
E-mail = [email protected]
Asal kampus = Universitas Andalas
Jurusan = Teknik Komputer
sumber dataset : https://www.kaggle.com/kishanyadav/inshort-news
"""
# Commented out IPython magic to ensure Python compatibility.
import pandas as pd
import seaborn as sns
import io
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Flatten, Dropout, Dense
# Apply seaborn's default plot styling to all matplotlib figures.
sns.set()
# %matplotlib inline
from google.colab import files
import warnings
# Silence library warnings to keep notebook output readable.
warnings.filterwarnings('ignore')
# Upload the seven InShorts CSV chunks in Colab and stack them into one
# DataFrame. Bind the upload dict to a distinct name — the original
# `files = files.upload()` shadowed the imported `google.colab.files` module.
uploaded = files.upload()
data = [
    pd.read_csv(io.BytesIO(uploaded[f'inshort_news_data-{i}.csv']))
    for i in range(1, 8)
]
df = pd.concat(data, axis=0, ignore_index=True)
df.head()
df.shape
# The exported row index ('Unnamed: 0') and the headline are unused
# downstream; keep only the article body and its category label.
df = df.drop(['Unnamed: 0', 'news_headline'], axis=1)
df.head()
"""
Data Visualization"""
plt.figure(figsize=(10,5))
sns.countplot(df['news_category'])
plt.xticks(rotation=45)
plt.show()
review = pd.get_dummies(df['news_category'])
df2 = pd.concat([df, review], axis=1)
df2 = df2.drop('news_category', axis=1)
df2.head()
"""Splitting, Tokenizing, dan Padding"""
X = df2['news_article'].values
y = df2.drop('news_article', axis=1).values
#split jadi train-test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)
tokenizer = Tokenizer(num_words=5000, oov_token='n')
tokenizer.fit_on_texts(X_train)
tokenizer.fit_on_texts(X_test)
sequence_train = tokenizer.texts_to_sequences(X_train)
sequence_test = tokenizer.texts_to_sequences(X_test)
padded_train = pad_sequences(sequence_train)
padded_test = pad_sequences(sequence_test)
"""model"""
class myCallback(tf.keras.callbacks.Callback):
def on_epoch_end(self, epoch, logs={}):
if(logs.get('val_accuracy') > 0.90):
print('\nakurasi telah mencapai 90%')
self.model.stop_training = True
callbacks = myCallback()
# 7-way news-category classifier: embedding -> LSTM -> MLP head.
model = Sequential([
    # Vocabulary size matches Tokenizer(num_words=5000); the original
    # input_dim=10000 allocated embedding rows the tokenizer never emits.
    Embedding(input_dim=5000, output_dim=128),
    LSTM(128),
    # NOTE: the original had Flatten() here, but LSTM(128) already
    # returns a flat (batch, 128) tensor, so it was a no-op and is dropped.
    Dropout(0.5),
    Dense(128, activation='relu'),
    Dense(64, activation='relu'),
    # 7 output units = 7 news categories; softmax for one-hot targets.
    Dense(7, activation='softmax')
])
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
history = model.fit(padded_train, y_train, epochs=62, validation_data=(padded_test, y_test), callbacks=[callbacks], batch_size=184)
"""plot hasil model"""
plt.figure(figsize=(8,5))
plt.plot(history.history['accuracy'], label='train_accuracy')
plt.plot(history.history['val_accuracy'], label='validation_accuracy')
plt.title('Model Accuracy')
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.legend()
plt.ylim(ymin=0, ymax=1)
plt.show()
plt.figure(figsize=(8,5))
plt.plot(history.history['loss'], label='train_loss')
plt.plot(history.history['val_loss'], label='validation_loss')
plt.title('Model Loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()
plt.ylim(ymin=0)
plt.show()