웹스크래핑 (Web Scraping)
In [92]:
from selenium import webdriver
from bs4 import BeautifulSoup
import re
def cleanText(readData):
    """Remove special characters and punctuation from *readData*.

    Letters, digits, and whitespace are kept; the listed ASCII punctuation
    and Korean typographic marks (※ ㆍ 』 ‘ … 》) are stripped.

    :param readData: input string to clean
    :return: the cleaned string
    """
    # Raw string avoids deprecated invalid-escape warnings.  NOTE: in the
    # original non-raw pattern the '\\' was consumed as an escape of the
    # following '‘', so backslashes were never removed — fixed here so the
    # character class really contains a literal backslash.
    text = re.sub(r'[-=+,#/\?:^$.@*\"※~&%ㆍ!』\\‘|\(\)\[\]\<\>`\'…》]', '', readData)
    return text
# Launch Chrome through the locally downloaded chromedriver binary.
driver = webdriver.Chrome('C:/Users/User/Downloads/chromedriver')
# Implicit wait: poll up to 3 seconds for elements to appear before
# failing a lookup (this is a timeout, not a fixed 3-second sleep).
driver.implicitly_wait(3)
# Open the Naver news search page for the keyword.
keyword = "주식"
URL = 'https://search.naver.com/search.naver?where=news&sm=tab_jum&query='
driver.get(URL + keyword)  # reuse URL instead of repeating the literal
#url=driver.find_element_by_css_selector('a._sp_each_title').text
# Headline text of the first news result (class specific to Naver's markup).
title = driver.find_element_by_css_selector('a._sp_each_title').text
print(title)
print(URL)
In [93]:
#print(driver)
import urllib.request
# Grab the first anchor tag and all links on the rendered page.
# NOTE(review): selected_tag_a / selected_link are never used afterwards.
selected_tag_a = driver.find_element_by_tag_name('a')
selected_link = driver.find_elements_by_partial_link_text('')
# Parse the selenium-rendered page with lxml (includes JS-generated DOM).
soup = BeautifulSoup(driver.page_source, 'lxml')
# Also fetch the bare search URL (no keyword appended) with urllib and
# parse it using the charset declared in the HTTP response headers.
resp = urllib.request.urlopen(URL)
soup1 = BeautifulSoup(resp,from_encoding=resp.info().get_param('charset'), features='html.parser')
# CSS selectors specific to Naver's news-search markup:
items = soup.select('a._sp_each_title')    # headline anchors
d1 = soup.select('dt')                     # definition-term wrappers
url = soup.select('a._sp_each_url')        # article source-URL anchors
source_time = soup.select('dd.txt_inline') # press name / publication time
def rr(data):
    """Strip anything shaped like an HTML/XML tag from *data*.

    The argument is coerced to ``str`` first, so lists of elements
    (e.g. the result of ``soup.select``) can be passed directly.
    """
    return re.sub("<.+?>", "", str(data))
#OMG = re.sub("<.+?>", "", str(items))
# Flatten the selected elements to plain text by stripping HTML tags.
OMG = rr(url)            # article source URLs
OMG2 = rr(source_time)   # press name / publication-time strings
print(OMG2)
print(OMG)
# BUG FIX: the attribute filter was misspelled 'herf', which matched no
# element; 'href=True' selects every anchor that actually carries a link.
test1 = soup1.find_all('a', href=True)
#import pandas as pd
#data = {'name' : OMG, 'time_souce' : OMG2}
#pd.DataFrame(data)
print(test1)
print(resp)
In [133]:
import urllib.request
from bs4 import BeautifulSoup
# Fetch the Naver news-search results page; the query parameter is the
# URL-encoded keyword "주식".
url = "https://search.naver.com/search.naver?where=news&sm=tab_jum&query=%EC%A3%BC%EC%8B%9D"
# Removed unused locals: a Request object was built but never used
# (urlopen was called on the raw URL), and aaa = [0] was dead.
sourcecode = urllib.request.urlopen(url).read()
soup = BeautifulSoup(sourcecode, "html.parser")
# Print the href of every anchor on the page; anchors without an href
# attribute print None (.get returns None for missing attributes).
for href in soup.find_all("a"):
    print(href.get('href'))