import re

from lxml import etree
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
import xlrd2
import xlwt
class Spider(object):
    driver_path = r'C:\Programs\chromedriver\chromedriver.exe'
    data = []        # scraped records, dumped to data.json once the last page is done
    line_index = 0   # next row to write in the worksheet
    # 新闻.xlsx ("news.xlsx") must already exist; rows are staged in memory via put_cell
    workbook = xlrd2.open_workbook("新闻.xlsx")
    rsheet = workbook.sheet_by_index(0)
    def __init__(self):
        # disable image loading to speed up page fetches
        options = webdriver.ChromeOptions()
        prefs = {"profile.managed_default_content_settings.images": 2}
        options.add_experimental_option("prefs", prefs)
        self.driver = webdriver.Chrome(executable_path=Spider.driver_path, options=options)
        self.url = "http://www.syomq.com/hangyexinwen/"
        try:
            # warm-up trick: a zero timeout on a trivial internal page keeps the
            # first real navigation from hanging on a slow driver start
            self.driver.set_page_load_timeout(0)
            self.driver.get("chrome://version/")
        except Exception:
            pass
        self.driver.set_page_load_timeout(60)
    def _repeat(self):
        self._get_list()
        # no "下一页" ("next page") link means this is the last page:
        # persist everything, then shut the browser down
        if len(self.driver.find_elements_by_xpath(
                '//ul[@class="pagelist"]/li/a[text()="下一页"]')) == 0:
            # xlrd2 sheets cannot be saved directly, so copy the in-memory rows
            # into a fresh xlwt workbook (note: xlwt writes the legacy .xls
            # format even when saved under an .xlsx name)
            wwb = xlwt.Workbook(encoding="utf-8")
            wsheet = wwb.add_sheet("新闻")
            for row in range(self.rsheet.nrows):
                for col in range(self.rsheet.ncols):
                    wsheet.write(row, col, self.rsheet.cell_value(row, col))
            wwb.save("新闻.xlsx")
            with open('data.json', 'w+', encoding='utf-8') as f:
                f.write(str(self.data))
            self.driver.close()
            self.driver.quit()
        else:
            # follow the "下一页" link and recurse onto the next page
            todo_eles = self.driver.find_elements_by_xpath('//ul[@class="pagelist"]/li/a')
            for i in todo_eles:
                if '下一页' in i.text:
                    self.driver.get(i.get_attribute('href'))
                    break
            self._repeat()
    def _get_list(self):
        # wait until the news list has rendered, then visit every article link
        WebDriverWait(self.driver, 1000).until(
            EC.presence_of_all_elements_located(
                (By.XPATH, ".//ul[@class='news_ul']/li/div[@class='txt']/h3/a"))
        )
        links = etree.HTML(self.driver.page_source).xpath(
            "//ul[@class='news_ul']/li/div[@class='txt']/h3/a/@href")
        for link in links:
            self._parse_new(link)
    def _parse_new(self, url):
        # open the article in a new tab and switch to it
        self.driver.execute_script("window.open('%s')" % url)
        self.driver.switch_to.window(self.driver.window_handles[1])
        html = etree.HTML(self.driver.page_source)
        title = html.xpath(
            "//div[@class='w1200']/div[@class='news_left']/h1/text()")[0].strip()
        context = "".join(html.xpath("//div[@class='news_nr']//text()")).strip()
        pub_time = re.match(r".*?(\d+-\d{2}-\d{2}\s\d{2}:\d{2})",
                            html.xpath("//div[@class='news_dis']/span/text()")[0]).group(1)
        fields = ['标题', '正文', '发布时间']  # title, body text, publish time
        self.data.append(dict(zip(fields, (title, context, pub_time))))
        # stage the record in the in-memory sheet, one column per field
        self.rsheet.put_cell(self.line_index, 0, xlrd2.XL_CELL_TEXT, title, None)
        self.rsheet.put_cell(self.line_index, 1, xlrd2.XL_CELL_TEXT, context, None)
        self.rsheet.put_cell(self.line_index, 2, xlrd2.XL_CELL_TEXT, pub_time, None)
        self.line_index += 1
        # close the article tab and return to the list page
        self.driver.close()
        self.driver.switch_to.window(self.driver.window_handles[0])
    def run(self):
        # kick off the crawl from the first list page
        self.driver.get(self.url)
        self._repeat()
if __name__ == '__main__':
    spider = Spider()
    spider.run()