
Python Day 9 / Web Scraping, Full Version

by SOLYI 2023. 1. 11.

Folder structure

main.py

extractors/indeed.py

extractors/wwr.py
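
For the `from extractors.wwr import ...` lines in main.py to resolve, the extractors folder just needs to sit next to main.py. On Python 3.3+ a plain folder already works as a namespace package, but adding an empty `__init__.py` (my addition, not in the original post) makes the layout explicit:

# Project layout assumed by the imports in main.py:
#
#   main.py
#   extractors/
#     __init__.py   # may be empty; optional on Python 3.3+ (namespace packages)
#     indeed.py
#     wwr.py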

 

Installation

$ pip install beautifulsoup4
$ pip install selenium
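
A quick sanity check (my addition, not part of the original lesson) confirms both packages landed in the interpreter you are actually running:

import bs4
import selenium

print(bs4.__version__, selenium.__version__)  # prints the installed versions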
# main.py

from extractors.wwr import extract_wwr_jobs
from extractors.indeed import extract_indeed_job

keyword = input("What do you want to search for? ")

file = open(f"{keyword}.csv", "w", encoding="utf-8-sig")

wwr = extract_wwr_jobs(keyword)
indeed = extract_indeed_job(keyword)
jobs = indeed + wwr

file.write("Position,Company,Location,URL\n")

for job in jobs:
  file.write(f"{job['position']},{job['company']},{job['location']},{job['link']}\n")
  
file.close()
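
Because the CSV rows are written by hand, any comma inside a value would break a column, which is why the extractors below call `.replace(',', ' ')` on every field. A hedged alternative sketch using the standard csv module, which quotes fields automatically so no replacement is needed (same extractors assumed):

# alternative main.py using the standard csv module (a sketch, not the lesson's version)
import csv

from extractors.wwr import extract_wwr_jobs
from extractors.indeed import extract_indeed_job

keyword = input("What do you want to search for? ")
jobs = extract_indeed_job(keyword) + extract_wwr_jobs(keyword)

with open(f"{keyword}.csv", "w", encoding="utf-8-sig", newline="") as file:
  writer = csv.writer(file)
  writer.writerow(["Position", "Company", "Location", "URL"])
  for job in jobs:
    # csv.writer quotes any value containing a comma, so .replace(',', ' ') is unnecessary
    writer.writerow([job["position"], job["company"], job["location"], job["link"]])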
## extractors/wwr.py

from requests import get
from bs4 import BeautifulSoup

def extract_wwr_jobs(keyword):
  base_url = "https://weworkremotely.com/remote-jobs/search?term="
  response = get(f"{base_url}{keyword}")
  if response.status_code != 200:
    print("Can't request website")
    return []  # return an empty list instead of None so main.py can still concatenate
  else:
    results = []

    soup = BeautifulSoup(response.text, "html.parser")
    jobs = soup.find_all('section', class_="jobs")
    # print(len(jobs))
    for job_section in jobs:
      job_posts = job_section.find_all('li')
      job_posts.pop(-1)  # the last li is the "view all jobs" link, not a posting
      for post in job_posts:
        anchors = post.find_all('a')
        anchor = anchors[1]  # the second anchor links to the job detail page
        link = anchor['href']

        company, kind, region = anchor.find_all('span', class_="company")  # exactly three spans carry this class

        title = anchor.find('span', class_="title")
        # print(company.string, kind.string, region.string, title.string)
        # print("///////////////////")

        job_data = {
          "link": f"https://weworkremotely.com{link}",  # href already begins with a slash
          "company": company.string.replace(',', ' '),
          "location": region.string.replace(',', ' '),
          "position": title.string.replace(',', ' ')
        }
        results.append(job_data)

    # print(results)
    # for result in results:
    #   print(result)
    #   print("///////")
    return results
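
The tuple unpacking on the `find_all('span', class_="company")` line works because each posting's anchor contains exactly three spans carrying that class. A minimal, self-contained sketch of the same pattern on made-up HTML (illustrative markup, not weworkremotely's real structure):

from bs4 import BeautifulSoup

html = """
<a href="/jobs/1">
  <span class="company">Acme Corp</span>
  <span class="company">Full-Time</span>
  <span class="region company">Anywhere</span>
  <span class="title">Python Developer</span>
</a>
"""

soup = BeautifulSoup(html, "html.parser")
anchor = soup.find("a")

# find_all returns a list; with exactly three matches it can be unpacked directly.
# class_="company" matches any element whose class list contains "company",
# which is why <span class="region company"> is included.
company, kind, region = anchor.find_all("span", class_="company")
title = anchor.find("span", class_="title")

print(company.string, kind.string, region.string, title.string)
# Acme Corp Full-Time Anywhere Python Developer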
## extractors/indeed.py

from requests import get
from bs4 import BeautifulSoup
# from extractors.wwr import extract_wwr_jobs  # importing a function
# from folder_name.file_name import function_name

# jobs = extract_wwr_jobs("python")
# print(jobs)

# base_url = "https://kr.indeed.com/jobs?q="
# search_term = "python"

# response = get(f"{base_url}{search_term}")

# print(response)
# if response.status_code != 200:
#   print("Fail")
# else:
#   print(response.text)


# The plain requests call returns 403 Forbidden, so use selenium instead.
# $ pip install selenium
# url download -> https://sites.google.com/a/chromium.org/chromedriver/downloads
# Download the chromedriver that matches the Chrome version you are using.
# If it still fails, put chromedriver.exe in the same directory as this .py file.
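# (Note, not in the original lesson: Selenium 4.6+ bundles Selenium Manager,
# which downloads a matching chromedriver automatically, so on recent versions
# webdriver.Chrome() often works with no manual driver download at all.)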


from selenium import webdriver
from selenium.webdriver.chrome.options import Options

options = Options()
options.add_argument("--no-sandbox")             # needed in some sandboxed/container environments
options.add_argument("--disable-dev-shm-usage")  # avoids /dev/shm space issues in containers
browser = webdriver.Chrome(options=options)

def extract_indeed_job(keyword):
  pages = get_page_count(keyword)
  print("found pages:", pages)
  results = []

  for page in range(pages):
    browser.get(f"https://kr.indeed.com/jobs?q={keyword}&start={page * 10}")

    soup = BeautifulSoup(browser.page_source, "html.parser")
    job_list = soup.find("ul", class_="jobsearch-ResultsList")
    jobs = job_list.find_all("li", recursive=False)
    # without recursive=False, find_all returns every nested li;
    # recursive=False limits it to the direct children only
    # print(len(jobs))

    for job in jobs:
      zone = job.find("div", class_="mosaic-zone")
      # li elements that contain a mosaic-zone div are not job postings
      if zone is None:  # None means "no data"; it is not the same thing as False
        # print("job li")
        anchor = job.select_one("h2 a")  # select_one does what the next two lines do, in one call
        # h2 = job.find("h2", class_="jobTitle")
        # a = h2.find("a")
        title = anchor['aria-label']
        link = anchor['href']
        # print(title, link)
        # print("\n\n\n")
        company = job.find("span", class_="companyName").string
        location = job.find("div", class_="companyLocation").string

        # print(company, location)

        job_data = {
          "link" : f"https://kr.indeed.com{link}",
          "company" : company.replace(',', ' '),
          "location" : location.replace(',', ' '),
          "position" : title.replace(',', ' '),
        }

        # print(job_data)
        # print("\n\n\n")
        results.append(job_data)

  print(len(results))
  # for result in results:
  #   print(result)

  return results
  # print(job_list)

# If the Chrome window keeps closing by itself, add the following at the very
# bottom of the script (outside any function; placed after a return, as it
# originally was here, it would never run):
# while True:
#   pass

def get_page_count(keyword):
  browser.get(f"https://kr.indeed.com/jobs?q={keyword}&limit=50")
  soup = BeautifulSoup(browser.page_source, "html.parser")
  pagination = soup.find("nav", role="navigation")
  if pagination is None:  # no pagination nav at all means a single page of results
    return 1

  pages = pagination.find_all("div", recursive=False)
  count = len(pages)
  if count == 0:
    return 1
  elif count >= 5:
    return 5  # cap the scrape at 5 pages
  else:
    return count

# jobs = extract_indeed_job("python")

# print(jobs)
# print(get_page_count("python"))
# print(get_page_count("nest"))
# print(get_page_count("django"))
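
get_page_count relies on counting only the nav's direct-child divs, which is exactly what recursive=False is for. A tiny standalone demo of the difference (made-up markup, not Indeed's real HTML):

from bs4 import BeautifulSoup

html = """
<nav role="navigation">
  <div>1</div>
  <div>2</div>
  <div><div>nested</div></div>
</nav>
"""

soup = BeautifulSoup(html, "html.parser")
nav = soup.find("nav", role="navigation")

print(len(nav.find_all("div")))                   # 4 -- every div, however deep
print(len(nav.find_all("div", recursive=False)))  # 3 -- direct children only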