반응형
파이썬 다음 뉴스 웹 크롤링하기
import requests
import lxml.html

# Date (YYYYMMDD) of the breaking-news listing page to fetch.
REG_DATE = '20200819'

listing = requests.get(
    'https://news.daum.net/breakingnews/digital?regDate={}'.format(REG_DATE))

# Parse the HTML and resolve relative hrefs against the final URL.
root = lxml.html.fromstring(listing.content)
root.make_links_absolute(listing.url)

# Print the headline text of every article link in the list.
for anchor in root.cssselect('.box_etc .tit_thumb a'):
    print(anchor.text)
import requests
import lxml.html
import pandas as pd
import sqlite3
from pandas.io import sql
import os
import re
import string
def get_detail(url):
    """Fetch a Daum news article page and return its cleaned body text.

    Collects every <p> directly under the article's #harmonyContainer
    section, strips a fixed set of punctuation/special characters from
    each paragraph, and joins the paragraphs with single spaces.

    :param url: absolute URL of a Daum news article page
    :return: article body as one space-joined string (may be empty)
    """
    # Raw string so the backslash before ']' is a real regex escape —
    # the original plain string relied on Python passing '\]' through,
    # which raises an invalid-escape SyntaxWarning on modern Python.
    punc = re.compile(r'[!"#$%&\'()*+,\-./:;<=>?\[\]^_`{|}~“”·]')
    response = requests.get(url)
    root = lxml.html.fromstring(response.content)
    body = []
    for p in root.xpath('//*[@id="harmonyContainer"]/section/p'):
        if p.text:  # skip paragraphs with no direct text node (e.g. images)
            body.append(punc.sub('', p.text))
    return ' '.join(body)
# Smoke test: fetch one known article (the return value is discarded).
get_detail('https://news.v.daum.net/v/20200505000102404')
import requests
import lxml.html
import pandas as pd
import sqlite3
from pandas.io import sql
import os
import re
import string
def get_detail(url):
    """Fetch a Daum news article page and return its cleaned body text.

    Collects every <p> directly under the article's #harmonyContainer
    section, strips a fixed set of punctuation/special characters from
    each paragraph, and joins the paragraphs with single spaces.

    :param url: absolute URL of a Daum news article page
    :return: article body as one space-joined string (may be empty)
    """
    # Raw string so the backslash before ']' is a real regex escape —
    # the original plain string relied on Python passing '\]' through,
    # which raises an invalid-escape SyntaxWarning on modern Python.
    punc = re.compile(r'[!"#$%&\'()*+,\-./:;<=>?\[\]^_`{|}~“”·]')
    response = requests.get(url)
    root = lxml.html.fromstring(response.content)
    body = []
    for p in root.xpath('//*[@id="harmonyContainer"]/section/p'):
        if p.text:  # skip paragraphs with no direct text node (e.g. images)
            body.append(punc.sub('', p.text))
    return ' '.join(body)
page = 1
max_page = 0
REG_DATE = '20200819'

# FIX: the query-string separator must be '&regDate='. The original text
# had the '&reg' fragment collapsed into the HTML entity '®', which
# silently dropped the regDate parameter from the request.
response = requests.get(
    'http://news.daum.net/breakingnews/digital?page={}&regDate={}'
    .format(page, REG_DATE))
root = lxml.html.fromstring(response.content)

# For each <li> in the article list: grab the title link, fetch the
# article body, and print a summary.
for li in root.xpath('//*[@id="mArticle"]/div[3]/ul/li'):
    a = li.xpath('div/strong/a')[0]
    url = a.get('href')
    article = get_detail(url)
    print(f'URL : {url}')
    print(f'TITLE : {a.text}')
    print(f'ARTICLE : {article}')
    print('-' * 100)
import requests
import lxml.html
import pandas as pd
import sqlite3
from pandas.io import sql
import os
import time
def db_save(NEWS_LIST):
    """Append the given DataFrame to the NEWS_LIST table in ./sqliteDB.

    :param NEWS_LIST: pandas DataFrame whose rows are persisted; its
        columns become/extend the table's columns.
    """
    with sqlite3.connect(os.path.join('.', 'sqliteDB')) as con:
        try:
            # if_exists options: 'fail' (default) / 'replace' / 'append'
            NEWS_LIST.to_sql(name='NEWS_LIST', con=con, index=False,
                             if_exists='append')
        except Exception as e:
            print(str(e))
        else:
            # FIX: only report success when the insert went through —
            # the original printed this even after the except branch ran.
            print(len(NEWS_LIST), '건 저장완료..')
def db_delete():
    """Remove every row from the NEWS_LIST table in ./sqliteDB."""
    db_path = os.path.join('.', 'sqliteDB')
    with sqlite3.connect(db_path) as con:
        try:
            con.cursor().execute('DELETE FROM NEWS_LIST')
        except Exception as e:
            # e.g. the table does not exist yet — report and carry on.
            print(str(e))
def db_select():
    """Load the entire NEWS_LIST table from ./sqliteDB into a DataFrame.

    :return: DataFrame containing every row of NEWS_LIST; an empty
        DataFrame when the query fails (e.g. the table does not exist).
    """
    # FIX: start from an empty frame so a failed query still returns a
    # valid DataFrame — the original left NEWS_LIST unbound and raised
    # UnboundLocalError at the return.
    NEWS_LIST = pd.DataFrame()
    with sqlite3.connect(os.path.join('.', 'sqliteDB')) as con:
        try:
            NEWS_LIST = pd.read_sql('SELECT * FROM NEWS_LIST', con=con)
        except Exception as e:
            print(str(e))
    return NEWS_LIST
import re
import string
def get_detail(url):
    """Fetch a Daum news article page and return its cleaned body text.

    Collects every <p> directly under the article's #harmonyContainer
    section, strips a fixed set of punctuation/special characters from
    each paragraph, and joins the paragraphs with single spaces.

    :param url: absolute URL of a Daum news article page
    :return: article body as one space-joined string (may be empty)
    """
    # Raw string so the backslash before ']' is a real regex escape —
    # the original plain string relied on Python passing '\]' through,
    # which raises an invalid-escape SyntaxWarning on modern Python.
    punc = re.compile(r'[!"#$%&\'()*+,\-./:;<=>?\[\]^_`{|}~“”·]')
    response = requests.get(url)
    root = lxml.html.fromstring(response.content)
    body = []
    for p in root.xpath('//*[@id="harmonyContainer"]/section/p'):
        if p.text:  # skip paragraphs with no direct text node (e.g. images)
            body.append(punc.sub('', p.text))
    return ' '.join(body)
page = 58
max_page = 0
REG_DATE = '20200819'

while True:
    df_list = []
    # FIX: the query-string separator must be '&regDate='. The original
    # text had '&reg' collapsed into the HTML entity '®', which dropped
    # the regDate parameter from the request.
    response = requests.get(
        'http://news.daum.net/breakingnews/digital?page={}&regDate={}'
        .format(page, REG_DATE))
    root = lxml.html.fromstring(response.content)

    # One single-row DataFrame per article on this listing page.
    for li in root.xpath('//*[@id="mArticle"]/div[3]/ul/li'):
        a = li.xpath('div/strong/a')[0]
        url = a.get('href')
        article = get_detail(url)
        df_list.append(
            pd.DataFrame({'URL': [url], 'TITLE': [a.text], 'ARTICLE': [article]}))
    if df_list:
        db_save(pd.concat(df_list))

    # Track the highest page number shown in the pager.
    for a in root.xpath('//*[@id="mArticle"]/div[3]/div/span/a'):
        try:
            num = int(a.text)
        except (TypeError, ValueError):
            # Non-numeric pager links (prev/next buttons) — skip them.
            continue
        if num > max_page:
            max_page = num

    # Last page: no "next" button and we are past the highest page seen.
    span = root.xpath(
        '//*[@id="mArticle"]/div[3]/div/span/a[@class="btn_page btn_next"]')
    if not span and page > max_page:
        break
    page = page + 1
    time.sleep(1)  # throttle requests to be polite to the server

print(db_select())
반응형
'Language > Python' 카테고리의 다른 글
'cp949' codec can't decode byte 0xee in position 15: illegal multibyte sequence (0) | 2020.09.13 |
---|---|
파이썬 데이터프레임 결합하기(concat) (0) | 2020.09.10 |
파이썬 folium 라이브러리 예제 (0) | 2020.09.10 |
파이썬 셀레늄(Selenium)을 이용한 크롤링하기 (0) | 2020.09.02 |
파이썬 퍼머 링크 목록 추출하기 (2) | 2020.09.02 |
파이썬 SQLite3 DBMS로 저장하기 (0) | 2020.09.01 |
파이썬 XML을 이용하여 기상청 데이터 스크래핑하기 (0) | 2020.09.01 |
파이썬 정규 표현식을 사용하기 위한 re 모듈 (0) | 2020.09.01 |
최근댓글