import re

from lxml import etree
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
import xlrd2
import xlwt
class Spider(object):
    driver_path = r'C:\Programs\chromedriver\chromedriver.exe'
    data = []        # scraped records, dumped to data.json once the last page is done
    line_index = 0   # next row to write in the worksheet
    # 新闻.xlsx ("news.xlsx") must already exist; rows are staged in memory via put_cell
    workbook = xlrd2.open_workbook("新闻.xlsx")
    rsheet = workbook.sheet_by_index(0)
    def __init__(self):
        # disable image loading to speed up page fetches
        options = webdriver.ChromeOptions()
        prefs = {"profile.managed_default_content_settings.images": 2}
        options.add_experimental_option("prefs", prefs)
        self.driver = webdriver.Chrome(executable_path=Spider.driver_path, options=options)
        self.url = "http://www.syomq.com/hangyexinwen/"
        try:
            # warm-up trick: a zero timeout on a trivial internal page keeps the
            # first real navigation from hanging on a slow driver start
            self.driver.set_page_load_timeout(0)
            self.driver.get("chrome://version/")
        except Exception:
            pass
        self.driver.set_page_load_timeout(60)
    def _repeat(self):
        self._get_list()
        # no "下一页" ("next page") link means this is the last page:
        # persist everything, then shut the browser down
        if len(self.driver.find_elements_by_xpath(
                '//ul[@class="pagelist"]/li/a[text()="下一页"]')) == 0:
            # xlrd2 sheets cannot be saved directly, so copy the in-memory rows
            # into a fresh xlwt workbook (note: xlwt writes the legacy .xls
            # format even when saved under an .xlsx name)
            wwb = xlwt.Workbook(encoding="utf-8")
            wsheet = wwb.add_sheet("新闻")
            for row in range(self.rsheet.nrows):
                for col in range(self.rsheet.ncols):
                    wsheet.write(row, col, self.rsheet.cell_value(row, col))
            wwb.save("新闻.xlsx")
            with open('data.json', 'w+', encoding='utf-8') as f:
                f.write(str(self.data))
            self.driver.close()
            self.driver.quit()
        else:
            # follow the "下一页" link and recurse onto the next page
            todo_eles = self.driver.find_elements_by_xpath('//ul[@class="pagelist"]/li/a')
            for i in todo_eles:
                if '下一页' in i.text:
                    self.driver.get(i.get_attribute('href'))
                    break
            self._repeat()
    def _get_list(self):
        # wait until the news list has rendered, then visit every article link
        WebDriverWait(self.driver, 1000).until(
            EC.presence_of_all_elements_located(
                (By.XPATH, ".//ul[@class='news_ul']/li/div[@class='txt']/h3/a"))
        )
        links = etree.HTML(self.driver.page_source).xpath(
            "//ul[@class='news_ul']/li/div[@class='txt']/h3/a/@href")
        for link in links:
            self._parse_new(link)
    def _parse_new(self, url):
        # open the article in a new tab and switch to it
        self.driver.execute_script("window.open('%s')" % url)
        self.driver.switch_to.window(self.driver.window_handles[1])
        html = etree.HTML(self.driver.page_source)
        title = html.xpath(
            "//div[@class='w1200']/div[@class='news_left']/h1/text()")[0].strip()
        context = "".join(html.xpath("//div[@class='news_nr']//text()")).strip()
        pub_time = re.match(r".*?(\d+-\d{2}-\d{2}\s\d{2}:\d{2})",
                            html.xpath("//div[@class='news_dis']/span/text()")[0]).group(1)
        fields = ['标题', '正文', '发布时间']  # title, body text, publish time
        self.data.append(dict(zip(fields, (title, context, pub_time))))
        # stage the record in the in-memory sheet, one column per field
        self.rsheet.put_cell(self.line_index, 0, xlrd2.XL_CELL_TEXT, title, None)
        self.rsheet.put_cell(self.line_index, 1, xlrd2.XL_CELL_TEXT, context, None)
        self.rsheet.put_cell(self.line_index, 2, xlrd2.XL_CELL_TEXT, pub_time, None)
        self.line_index += 1
        # close the article tab and return to the list page
        self.driver.close()
        self.driver.switch_to.window(self.driver.window_handles[0])
    def run(self):
        # kick off the crawl from the first list page
        self.driver.get(self.url)
        self._repeat()
if __name__ == '__main__':
    spider = Spider()
    spider.run()