# Crawling test (inspect the HTML structure)
import requests
from bs4 import BeautifulSoup

test_url = "https://movie.naver.com/movie/bi/mi/pointWriteFormList.nhn?code=136990&type=after&page=1"
resp = requests.get(test_url)
html = BeautifulSoup(resp.content, 'html.parser')
html  # print the parsed document to inspect its structure
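Before crawling thousands of pages, it is worth confirming that the review container can actually be located in the parsed page. A minimal sanity check, assuming the reviews sit inside the score_result div used in the next section:

# Quick check: reviews should live inside the 'score_result' div
score_div = html.find('div', {'class': 'score_result'})
if score_div is not None:
    print(len(score_div.findAll('li')))  # reviews on this page (typically 10)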
# Movie: Kundo (군도)
gundo_url = 'https://movie.naver.com/movie/bi/mi/pointWriteFormList.nhn?code=99752&type=after&page='
gundo_page = 2000
gundo_texts = []
gundo_scores = []

def organize_data(x):
    # Each review is an <li> under the score_result div; the class names
    # below follow the pointWriteFormList page markup inspected above
    a = x.find('div', {'class': 'score_result'})
    lists = a.findAll('li')
    for li in lists:
        gundo_texts.append(li.find('div', {'class': 'score_reple'}).find('p').get_text())
        gundo_scores.append(li.find('div', {'class': 'star_score'}).find('em').get_text())

# Crawl every page and collect review texts and scores
for i in range(gundo_page):
    requested = requests.get(gundo_url + str(i + 1))
    html = BeautifulSoup(requested.content, 'html.parser')
    organize_data(html)

# Strip the '관람객' (verified-viewer) prefix and whitespace characters
gundo_reviews = []
for text in gundo_texts:
    if '관람객' == text[0:3]:
        a = text[3:]
    else:
        a = text
    a = a.replace('\n', '')
    a = a.replace('\t', '')
    a = a.replace('\r', '')
    gundo_reviews.append(a)

num_scores = []
for i in gundo_scores:
    num_scores.append(int(i))

# Negative = 0, positive = 1
gundo_labels = []
for i in num_scores:
    if i <= 7:
        gundo_labels.append(0)
    else:
        gundo_labels.append(1)

# Movie: Incredibles 2 (인크레더블2)
incre_url = 'https://movie.naver.com/movie/bi/mi/pointWriteFormList.nhn?code=136990&type=after&page='
incre_page = 715
incre_texts = []
incre_scores = []

def organize_data(x):  # redefined so it appends to the Incredibles 2 lists
    a = x.find('div', {'class': 'score_result'})
    lists = a.findAll('li')
    for li in lists:
        incre_texts.append(li.find('div', {'class': 'score_reple'}).find('p').get_text())
        incre_scores.append(li.find('div', {'class': 'star_score'}).find('em').get_text())

for i in range(incre_page):
    requested = requests.get(incre_url + str(i + 1))
    html = BeautifulSoup(requested.content, 'html.parser')
    organize_data(html)

incre_reviews = []
for text in incre_texts:
    if '관람객' == text[0:3]:
        b = text[3:]
    else:
        b = text
    b = b.replace('\n', '')
    b = b.replace('\t', '')
    b = b.replace('\r', '')
    incre_reviews.append(b)

num_scores = []
for i in incre_scores:
    num_scores.append(int(i))

incre_labels = []
for i in num_scores:
    if i <= 7:
        incre_labels.append(0)
    else:
        incre_labels.append(1)

# We'll shuffle later anyway, so just concatenate the two movies
reviews = gundo_reviews + incre_reviews
labels = gundo_labels + incre_labels

import numpy as np
labels = np.array(labels)
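A quick sanity check (not part of the original cells) to confirm the reviews and labels line up and to see how balanced the two classes are:

# Review and label counts must match; the sums show the class balance
print(len(reviews), len(labels))
print('positive:', int(labels.sum()), '/ negative:', int((labels == 0).sum()))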
!pip install pytorch-transformers
!pip install torch==1.1.0
!pip install transformers==2.2.2
!pip install sentencepiece==0.1.82

import torch
import tensorflow as tf
import numpy as np
import pandas as pd
from transformers import BertTokenizer
from transformers import BertForSequenceClassification, AdamW, BertConfig
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

# 70/30 train/test split
review_train, review_test, label_train, label_test = train_test_split(reviews, labels, random_state=100, test_size=0.3, shuffle=True)

# BERT expects [CLS] at the start and [SEP] at the end of each sentence
review_train_sent = ['[CLS] ' + str(review) + ' [SEP]' for review in review_train]
review_test_sent = ['[CLS] ' + str(review) + ' [SEP]' for review in review_test]

tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased', do_lower_case=False)
tokenized_review_train = [tokenizer.tokenize(sent) for sent in review_train_sent]
tokenized_review_test = [tokenizer.tokenize(sent) for sent in review_test_sent]

# Map tokens to vocabulary ids, then pad/truncate to a fixed length
train_review_token_id = [tokenizer.convert_tokens_to_ids(sent) for sent in tokenized_review_train]
test_review_token_id = [tokenizer.convert_tokens_to_ids(sent) for sent in tokenized_review_test]
max_len = 128
train_review_token_id = pad_sequences(train_review_token_id, maxlen=max_len, dtype='long', padding='post', truncating='post')
test_review_token_id = pad_sequences(test_review_token_id, maxlen=max_len, dtype='long', padding='post', truncating='post')

# Attention masks: 1.0 for real tokens, 0.0 for padding
attention_mask_train = []
for sent in train_review_token_id:
    mask_train = [float(i > 0) for i in sent]
    attention_mask_train.append(mask_train)
attention_mask_test = []
for sent in test_review_token_id:
    mask_test = [float(i > 0) for i in sent]
    attention_mask_test.append(mask_test)

# Convert everything to PyTorch tensors
train_review_token_id = torch.tensor(train_review_token_id)
attention_mask_train = torch.tensor(attention_mask_train)
label_train = torch.tensor(label_train)
test_review_token_id = torch.tensor(test_review_token_id)
attention_mask_test = torch.tensor(attention_mask_test)
label_test = torch.tensor(label_test)
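To see what the multilingual WordPiece tokenizer does to Korean text, here is a small illustration; the sample review is a made-up example, not crawled data:

# Example: how one Korean sentence is tokenized and mapped to ids
sample = '[CLS] 배우들의 연기가 정말 좋았다 [SEP]'   # hypothetical sample review
sample_tokens = tokenizer.tokenize(sample)
sample_ids = tokenizer.convert_tokens_to_ids(sample_tokens)
print(sample_tokens)   # WordPiece sub-tokens ('##'-prefixed pieces)
print(sample_ids)      # the integer ids fed into BERT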
from torch.utils.data import TensorDataset, DataLoader, RandomSampler

# Batch size
batch_size = 32

# Bundle the tensors and wrap them in DataLoaders
train_data = TensorDataset(train_review_token_id, attention_mask_train, label_train)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, batch_size=batch_size, sampler=train_sampler)
test_data = TensorDataset(test_review_token_id, attention_mask_test, label_test)
test_dataloader = DataLoader(test_data, batch_size=batch_size)

# Device setup so we can train on the GPU when one is available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Model setup
model = BertForSequenceClassification.from_pretrained("bert-base-multilingual-cased", num_labels=2)
model.to(device)

# Optimizer setup
optimizer = AdamW(model.parameters(),
                  lr=2e-5,   # learning rate
                  eps=1e-8   # epsilon to avoid division by zero
                  )

# Number of epochs
epochs = 3

# Total training steps: batches per epoch * epochs
total_steps = len(train_dataloader) * epochs

# Accuracy helper: fraction of argmax predictions that match the labels
def flat_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)
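A tiny check of flat_accuracy on made-up logits, to show the convention (row-wise argmax compared against the label vector):

# Made-up logits for 4 examples: argmax gives predictions [1, 0, 1, 0]
_dummy_preds = np.array([[0.1, 0.9], [0.8, 0.2], [0.3, 0.7], [0.6, 0.4]])
_dummy_labels = np.array([1, 0, 0, 0])
print(flat_accuracy(_dummy_preds, _dummy_labels))   # 0.75: three of four match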
# Training: repeat for each epoch
for epoch_i in range(0, epochs):
    # ========================================
    #               Training
    # ========================================
    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
    print('Training...')

    # Reset the loss
    total_loss = 0

    # Switch to training mode
    model.train()

    # Iterate over the DataLoader batch by batch
    for step, batch in enumerate(train_dataloader):
        # Move the batch to the GPU
        batch = tuple(t.to(device) for t in batch)

        # Unpack the batch
        b_input_ids, b_input_mask, b_labels = batch

        # Forward pass
        outputs = model(b_input_ids,
                        token_type_ids=None,
                        attention_mask=b_input_mask,
                        labels=b_labels
                        )

        # Get the loss
        loss = outputs[0]

        # Accumulate the total loss
        total_loss += loss.item()

        # Backward pass to compute gradients
        loss.backward()

        # Update the weight parameters using the gradients
        optimizer.step()

        # Reset the gradients: PyTorch accumulates gradients across backward
        # passes, so they must be zeroed before the next step
        model.zero_grad()

    # Average loss over the epoch
    avg_train_loss = total_loss / len(train_dataloader)
    print(avg_train_loss)

    # ========================================
    #                 Test
    # ========================================
    print("")
    print("Running test...")

    accuracies = []
    predictions = []

    # Switch to evaluation mode
    model.eval()

    # Initialize the evaluation variables
    eval_loss, eval_accuracy = 0, 0
    nb_eval_steps, nb_eval_examples = 0, 0

    # Iterate over the test DataLoader batch by batch
    for batch in test_dataloader:
        # Move the batch to the GPU
        batch = tuple(t.to(device) for t in batch)

        # Unpack the batch
        b_input_ids, b_input_mask, b_labels = batch

        # No gradient computation during evaluation
        with torch.no_grad():
            # Forward pass
            outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)

        # Get the logits
        logits = outputs[0]

        # Move the data to the CPU
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()

        # Compare logits against labels to compute accuracy
        tmp_eval_accuracy = flat_accuracy(logits, label_ids)
        eval_accuracy += tmp_eval_accuracy
        nb_eval_steps += 1

    print('Accuracy: {0:.5f}'.format(eval_accuracy / nb_eval_steps))
    accuracies.append(eval_accuracy / nb_eval_steps)

print("complete!")
The final test accuracy came out around 84%.
# Test on a new movie
def convert_input_data(sentences):
    # Tokenize, convert to ids, and pad exactly as in training
    tokenized_text = [tokenizer.tokenize(sent) for sent in sentences]
    Max_len = 128
    input_ids = [tokenizer.convert_tokens_to_ids(sent) for sent in tokenized_text]
    input_ids = pad_sequences(input_ids, maxlen=Max_len, padding='post', truncating='post', dtype='long')

    # Attention masks: 1.0 for real tokens, 0.0 for padding
    attention_mask = []
    for seq in input_ids:
        seq_mask = [float(i > 0) for i in seq]
        attention_mask.append(seq_mask)

    inputs = torch.tensor(input_ids)
    masks = torch.tensor(attention_mask)
    return inputs, masks

def test_sentences(sentences):
    inputs, masks = convert_input_data(sentences)
    b_input_ids = inputs.to(device)
    b_input_masks = masks.to(device)

    # Forward pass without gradients, then return the raw logits
    with torch.no_grad():
        outputs = model(b_input_ids,
                        token_type_ids=None,
                        attention_mask=b_input_masks)
    logits = outputs[0]
    logits = logits.detach().cpu().numpy()
    return logits
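A usage sketch for test_sentences; the review strings below are hypothetical examples, and a row-wise argmax over the returned logits gives the predicted label:

# Hypothetical new reviews to classify
new_reviews = ['정말 재미있게 봤습니다', '시간이 아까운 영화였다']
logits = test_sentences(new_reviews)
preds = np.argmax(logits, axis=1)   # 0 = negative, 1 = positive
print(preds)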