프로그래밍/PYTHON

파이썬(Python)으로 온라인 서점 사이트의 IT 신간 서적 크롤링하기

재우니 2021. 4. 15. 03:01

파이썬(Python)으로 온라인 서점 사이트의 IT 신간 서적 크롤링하기

 

 

github.com/shimpark/git-action-python

 

shimpark/git-action-python

Contribute to shimpark/git-action-python development by creating an account on GitHub.

github.com

 

1. yes24

 

import requests
from bs4 import BeautifulSoup


def parsing_beautifulsoup(url, timeout=10):
    """
    Fetch a page over HTTP and parse it with BeautifulSoup.

    :param url: URL to fetch and parse; here, the YES24 link
    :param timeout: seconds to wait for the server before giving up
        (forwarded to requests.get)
    :return: BeautifulSoup object built from the response body text
    """

    # requests never times out unless told to, so an unresponsive server
    # would otherwise block the caller indefinitely.
    response = requests.get(url, timeout=timeout)

    return BeautifulSoup(response.text, 'html.parser')


def extract_book_data(soup):
    """
    Build an HTML snippet listing the books found in a parsed page.

    Every element matching ".goodsTxtInfo" contributes one line of the form
    "<a href='URL'>NAME</a>, PRICE<br/>".

    :param soup: BeautifulSoup object of the listing page
    :return: the concatenated lines as a str; empty string when nothing matches
    :raises IndexError: when an entry has fewer than two anchors or no
        ".priceB" element
    :raises KeyError: when the second anchor has no href attribute
    """

    url_prefix = "http://www.yes24.com"
    lines = []

    for new_book in soup.select(".goodsTxtInfo"):
        anchors = new_book.select("a")
        # The name is read from the first anchor, the link from the second.
        book_name = anchors[0].text
        # The href is appended to the site root (presumably site-relative).
        url = url_prefix + anchors[1].attrs['href']
        price = new_book.select(".priceB")[0].text

        # The href value is quoted: an unquoted attribute value is malformed
        # as soon as the URL contains '=' or whitespace.
        lines.append(f"<a href='{url}'>{book_name}</a>, {price}<br/>\n")

    return "".join(lines)

 

2. 인터파크

 

import requests
from bs4 import BeautifulSoup


def parsing_interpark_beautifulsoup(url, timeout=10):
    """
    Fetch a page over HTTP and parse it with BeautifulSoup.

    :param url: URL to fetch and parse; here, the Interpark link
    :param timeout: seconds to wait for the server before giving up
        (forwarded to requests.get)
    :return: BeautifulSoup object built from the response body text
    """

    # requests never times out unless told to, so an unresponsive server
    # would otherwise block the caller indefinitely.
    response = requests.get(url, timeout=timeout)

    return BeautifulSoup(response.text, 'html.parser')


def extract_interpark_book_data(soup):
    """
    Build an HTML snippet listing the books found in a parsed page.

    Every element matching ".displayWrap" contributes one line of the form
    "<a href='URL'>NAME</a>, PRICE<br/>".

    :param soup: BeautifulSoup object of the listing page
    :return: the concatenated lines as a str
    """

    def _render(entry):
        # Name, link and price all sit under the entry's first div.infoWrap.
        info = entry.select("div.infoWrap")[0]
        title_link = info.select("p.inc_tit")[0].select("a")[0]

        name = title_link.select("b")[0].text
        href = title_link.attrs['href']

        # The price is read from the <b> inside the span selected by
        # "span:nth-child(3)" within p.inc_price.
        price_span = info.select("p.inc_price")[0].select("span:nth-child(3)")[0]
        cost = price_span.select("b")[0].text

        return f"<a href='{href}'>" + name + "</a>" + ", " + cost + "<br/>\n"

    return "".join(_render(entry) for entry in soup.select(".displayWrap"))

 

3. 교보문고

 

import requests
from bs4 import BeautifulSoup


def parsing_kyobo_beautifulsoup(url, timeout=10):
    """
    Fetch a page over HTTP and parse it with BeautifulSoup.

    :param url: URL to fetch and parse; here, the Kyobo Book Centre link
    :param timeout: seconds to wait for the server before giving up
        (forwarded to requests.get)
    :return: BeautifulSoup object built from the response body text
    """

    # requests never times out unless told to, so an unresponsive server
    # would otherwise block the caller indefinitely.
    response = requests.get(url, timeout=timeout)

    return BeautifulSoup(response.text, 'html.parser')


def extract_kyobo_book_data(soup):
    """
    Build an HTML snippet listing the books found in a parsed page.

    Every <div class="detail"> element contributes one line of the form
    "<a href='URL'>NAME</a>, PRICE<br/>".

    :param soup: BeautifulSoup object of the listing page
    :return: the concatenated lines as a str
    """

    rendered = []

    for detail in soup.find_all("div", class_="detail"):
        # The first anchor inside div.title holds both the name and the link.
        title_anchor = detail.select("div.title")[0].select("a")[0]

        name = title_anchor.select("strong")[0].text
        href = title_anchor.attrs['href']
        cost = detail.select("strong.sell_price")[0].text

        rendered.append(
            f"<a href='{href}'>" + name + "</a>" + ", " + cost + "<br/>\n")

    return "".join(rendered)