Web crawling

Instagram images crawling

from selenium import webdriver
import numpy as np
import urllib
import selenium
from urllib.request import Request, urlopen
import requests
import time
from bs4 import BeautifulSoup
from urllib import request
import pandas as pd
 
 
# Instagram tag-image crawler.
#
# Flow:
#   1. Ask the user for a hashtag, percent-encode it, and open the tag's
#      explore page in Chrome via Selenium.
#   2. Repeatedly scroll to the bottom, collecting the `src` of every image
#      element (class name 'FFVAD'), until two consecutive scrolls no longer
#      grow the page — i.e. the infinite scroll is exhausted.
#   3. De-duplicate the collected URLs and download each as picture_<i>.png.

search = input('검색어를 입력하시오: ')  # hashtag to search for
search = urllib.parse.quote(search)      # percent-encode for use in the URL
url = 'https://www.instagram.com/explore/tags/' + str(search) + '/'

# Launch Chrome via an absolute path to the ChromeDriver binary.
# NOTE(review): Selenium 4 removed both the positional executable_path
# argument and the find_elements_by_* helpers — this script assumes
# Selenium 3.x; confirm the installed version.
driver = webdriver.Chrome('C:\\Users\\user\\Desktop\\chromedriver\\chromedriver.exe')
driver.get(url)
time.sleep(5)  # give the page time to finish loading before scraping

SCROLL_PAUSE_TIME = 2  # seconds to wait after each scroll for new content
sources = []           # collected image URLs (may contain duplicates)

while True:
    # Grab every image element currently rendered on the page.
    images = driver.find_elements_by_class_name('FFVAD')

    for image in images:
        try:
            source = image.get_attribute('src')
            if source:                  # skip images whose src is not set yet
                sources.append(source)
        except Exception:               # e.g. element went stale mid-scroll
            print('Null')

    # Scroll one page-height down and check whether the document grew.
    last_height = driver.execute_script('return document.body.scrollHeight')
    driver.execute_script('window.scrollTo(0, document.body.scrollHeight);')
    time.sleep(SCROLL_PAUSE_TIME)
    new_height = driver.execute_script('return document.body.scrollHeight')

    if new_height == last_height:
        # The page did not grow; try one more scroll before giving up,
        # since lazy-loaded content can take longer than SCROLL_PAUSE_TIME.
        driver.execute_script('window.scrollTo(0, document.body.scrollHeight);')
        time.sleep(SCROLL_PAUSE_TIME)
        new_height = driver.execute_script('return document.body.scrollHeight')
        if new_height == last_height:
            break  # still no new content after a second scroll: we are done

# Remove duplicate URLs (the same image is collected on every pass).
sources = set(sources)

# Download each image. urlretrieve saves the network object at the URL
# directly to a local file.
for i, source in enumerate(sources):
    urllib.request.urlretrieve(source, "picture_" + str(i) + '.png')