본문 바로가기

web crawling

Instagram Hashtag crawling using Selenium

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
import pandas as pd
from selenium import webdriver
import numpy as np
import urllib
import selenium
from urllib.request import Request, urlopen
import requests
import time
from bs4 import BeautifulSoup
from urllib import request
 
 
# --- Setup: build the hashtag URL and open it in a Selenium-driven Chrome ---

search = input('검색어를 입력하시오: ')
search = urllib.parse.quote(search)  # percent-encode so non-ASCII (e.g. Korean) terms form a valid URL path
url = 'https://www.instagram.com/explore/tags/' + str(search) + '/'


# BUG FIX: the original called driver.Chrome(...) before `driver` existed;
# the driver object must be created via webdriver.Chrome(<chromedriver path>).
driver = webdriver.Chrome('C:\\Users\\user\\Desktop\\chromedriver\\chromedriver.exe')
driver.get(url)
time.sleep(5)  # give the page time to fully load (original pasted line was missing the closing paren)


SCROLL_PAUSE_TIME = 1.5  # seconds to wait after each scroll-down
reallink = []  # collects each post's unique /p/<shortcode>/ path
 
# --- Infinite-scroll loop: keep scrolling and harvesting post links until
# --- the page height stops growing (i.e. no more posts are loaded). ---
# NOTE: the original paste dropped closing parens/brackets on several lines
# (select('a')[0, execute_script('...'), time.sleep lines); restored here.
while True:
    pagestring = driver.page_source
    bs = BeautifulSoup(pagestring, 'lxml')

    # Instagram renders posts three per row inside div.Nnq7C.weEfm; the
    # original indexed a[0], a[1], a[2] explicitly, which raised IndexError
    # on a final, partially-filled row — slicing [:3] handles that safely.
    for row in bs.find_all(name='div', attrs={'class': 'Nnq7C weEfm'}):
        for anchor in row.select('a')[:3]:
            reallink.append(anchor.attrs['href'])

    # Scroll to the bottom, then compare page height before/after.
    last_height = driver.execute_script('return document.body.scrollHeight')
    driver.execute_script('window.scrollTo(0, document.body.scrollHeight);')
    time.sleep(SCROLL_PAUSE_TIME)
    new_height = driver.execute_script('return document.body.scrollHeight')

    if new_height == last_height:
        # Height unchanged: scroll once more before concluding we hit the end
        # (content sometimes lags behind the scroll event).
        driver.execute_script('window.scrollTo(0, document.body.scrollHeight);')
        time.sleep(SCROLL_PAUSE_TIME)
        new_height = driver.execute_script('return document.body.scrollHeight')
        if new_height == last_height:
            break  # still unchanged after the retry -> end of the feed
        # (the original's `new_height = last_height; continue` was dead code:
        # both variables are recomputed at the top of the loop anyway)
 
 
 
# --- Fetch each collected post page and extract its hashtags from the
# --- <meta property="instapp:hashtags" content="..."> tags. ---
reallinknum = len(reallink)
csvtext = []
for i in range(0, reallinknum):
    csvtext.append([])  # one row per post, kept even when the fetch fails
    try:
        # Instagram may refuse obviously bot-like requests; sending a browser
        # User-Agent header in the request avoids being served an empty page.
        req = Request('https://www.instagram.com' + reallink[i],
                      headers={'User-Agent': 'Mozilla/5.0'})
        webpage = urlopen(req).read()
    except Exception:
        # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit still
        # propagate; network/HTTP errors are logged by index and skipped.
        print('error:', i)
        continue

    soup = BeautifulSoup(webpage, 'lxml', from_encoding='utf-8')

    for tag in soup.find_all('meta', attrs={'property': 'instapp:hashtags'}):
        csvtext[i].append(tag['content'])


data = pd.DataFrame(csvtext)  # rows = posts, columns = hashtag positions
print(data)
cs