
Crawling Online Bookstore Sites for New IT Books with Python

 

 

github.com/shimpark/git-action-python

 


 

1. YES24

 

import requests
from bs4 import BeautifulSoup


def parsing_beautifulsoup(url):
    """
    Parse a page with BeautifulSoup.
    :param url: URL to parse. Here, a YES24 link.
    :return: BeautifulSoup soup object
    """

    data = requests.get(url)

    html = data.text
    soup = BeautifulSoup(html, 'html.parser')
    return soup


def extract_book_data(soup):
    """
    Extract book data from a BeautifulSoup object.
    :param soup: BeautifulSoup soup object
    :return: contents (str)
    """

    upload_contents = ''
    new_books = soup.select(".goodsTxtInfo")
    url_prefix = "http://www.yes24.com"

    for new_book in new_books:
        # The first <a> in each item holds the title; the second holds
        # the relative link to the book's detail page.
        book_name = new_book.select("a")[0].text
        url_suffix = new_book.select("a")[1].attrs['href']
        url = url_prefix + url_suffix
        price = new_book.select(".priceB")[0].text

        # Quote the href value so URLs with special characters stay valid HTML.
        content = f"<a href='{url}'>{book_name}</a>, {price}<br/>\n"
        upload_contents += content

    return upload_contents
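
For reference, here's a minimal sketch of how the two functions fit together: fetch, parse, extract. The URL is a placeholder (an assumption on my part), not a real YES24 category page; substitute the listing page you actually crawl.

# Minimal usage sketch. The URL is a placeholder, not a real
# YES24 category link; fill in the page you actually crawl.
if __name__ == "__main__":
    yes24_url = "http://www.yes24.com/..."  # placeholder
    soup = parsing_beautifulsoup(yes24_url)
    print(extract_book_data(soup))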

 

2. Interpark

 

import requests
from bs4 import BeautifulSoup


def parsing_interpark_beautifulsoup(url):
    """
    Parse a page with BeautifulSoup.
    :param url: URL to parse. Here, an Interpark link.
    :return: BeautifulSoup soup object
    """

    data = requests.get(url)

    html = data.text
    soup = BeautifulSoup(html, 'html.parser')
    return soup


def extract_interpark_book_data(soup):
    """
    Extract book data from a BeautifulSoup object.
    :param soup: BeautifulSoup soup object
    :return: contents (str)
    """

    upload_contents = ''
    new_books = soup.select(".displayWrap")

    for new_book in new_books:
        # Descendant selectors replace the original chains of
        # select(...)[0] calls; they match the same elements.
        book_name = new_book.select_one("div.infoWrap p.inc_tit a b").text
        # Interpark hrefs are already absolute, so no prefix is needed.
        url = new_book.select_one("div.infoWrap p.inc_tit a").attrs['href']
        price = new_book.select_one(
            "div.infoWrap p.inc_price span:nth-child(3) b").text

        content = f"<a href='{url}'>{book_name}</a>, {price}<br/>\n"
        upload_contents += content

    return upload_contents
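
Same pattern as YES24; a minimal sketch, again with a placeholder URL (an assumption) standing in for the actual Interpark listing page.

# Minimal usage sketch. The URL is a placeholder; substitute the
# Interpark listing page you actually crawl.
if __name__ == "__main__":
    interpark_url = "http://book.interpark.com/..."  # placeholder
    soup = parsing_interpark_beautifulsoup(interpark_url)
    print(extract_interpark_book_data(soup))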

 

3. Kyobo Bookstore

 

import requests
from bs4 import BeautifulSoup


def parsing_kyobo_beautifulsoup(url):
    """
    Parse a page with BeautifulSoup.
    :param url: URL to parse. Here, a Kyobo Bookstore link.
    :return: BeautifulSoup soup object
    """

    data = requests.get(url)

    html = data.text
    soup = BeautifulSoup(html, 'html.parser')
    return soup


def extract_kyobo_book_data(soup):
    """
    Extract book data from a BeautifulSoup object.
    :param soup: BeautifulSoup soup object
    :return: contents (str)
    """

    upload_contents = ''
    new_books = soup.select("div.detail")

    for new_book in new_books:
        # Descendant selectors replace the original chains of
        # select(...)[0] calls; they match the same elements.
        book_name = new_book.select_one("div.title a strong").text
        # Kyobo hrefs are already absolute, so no prefix is needed.
        url = new_book.select_one("div.title a").attrs['href']
        price = new_book.select_one("strong.sell_price").text

        content = f"<a href='{url}'>{book_name}</a>, {price}<br/>\n"
        upload_contents += content

    return upload_contents
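
All three sites follow the same fetch-parse-extract pattern, so combining them is just concatenating the extractors' output. A minimal sketch, assuming the three snippets live in one module; all three URLs are placeholders (assumptions), not the actual listing pages.

# Minimal combined sketch. All URLs are placeholders; substitute the
# listing pages you actually crawl.
if __name__ == "__main__":
    pages = [
        (parsing_beautifulsoup, extract_book_data,
         "http://www.yes24.com/..."),
        (parsing_interpark_beautifulsoup, extract_interpark_book_data,
         "http://book.interpark.com/..."),
        (parsing_kyobo_beautifulsoup, extract_kyobo_book_data,
         "http://www.kyobobook.co.kr/..."),
    ]
    upload_contents = ''
    for parse, extract, url in pages:
        upload_contents += extract(parse(url))
    print(upload_contents)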