Dynamic Crawler

Spider (main module):

import scrapy
from scrapy.spiders import Spider
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from pydispatch import dispatcher
from scrapy import signals
from ..items import PicItem  # adjust the import path to match your project layout
import time

class MirrowSpider(Spider):
    name = "moving_pic"
    allowed_domains = ["dimtown.com"]
    start_urls = ["https://dimtown.com/jxmt"]
    title_count = 0    # counter for processed detail pages
    title_limit = 200  # stop after this many titles

    def __init__(self, *args, **kwargs):
        super(MirrowSpider, self).__init__(*args, **kwargs)
        # a single WebDriver instance is shared by all callbacks
        self.driver = webdriver.Chrome()
        dispatcher.connect(self.spider_closed, signals.spider_closed)

    def spider_closed(self, spider):
        self.driver.quit()

    def parse(self, response):
        # Log the User-Agent actually used for this request
        user_agent = response.request.headers.get('User-Agent', b'').decode('utf-8')
        self.logger.info(f"Current User-Agent: {user_agent}")

        self.driver.get(response.url)
        wait = WebDriverWait(self.driver, 20)  # generous timeout

        try:
            # Find and click the "most commented" sort link
            comments_link = wait.until(EC.element_to_be_clickable((By.XPATH, '//a[@data-orderby="comment_count"]')))
            self.logger.info("Found the 'most commented' link, clicking it")
            comments_link.click()

            # Record the initial page height
            last_height = self.driver.execute_script("return document.body.scrollHeight")

            while True:
                # Scroll to the bottom of the page
                self.driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
                self.logger.info("Scrolled down")
                time.sleep(5)  # wait for lazy-loaded content
                new_height = self.driver.execute_script("return document.body.scrollHeight")
                if new_height == last_height:
                    break
                last_height = new_height

            # Collect all detail-page links
            links = self.driver.find_elements(By.XPATH, '//a[contains(@href, ".html")]')
            detail_urls = [link.get_attribute('href') for link in links]
            self.logger.info("Detail page URLs: %s", detail_urls)

            for url in detail_urls:
                yield scrapy.Request(url, callback=self.parse_detail)

        except Exception as e:
            self.logger.error("Error while parsing the index page: %s", e)

    def parse_detail(self, response):
        # Log the User-Agent actually used for this request
        user_agent = response.request.headers.get('User-Agent', b'').decode('utf-8')
        self.logger.info(f"Current User-Agent: {user_agent}")

        self.driver.get(response.url)
        wait = WebDriverWait(self.driver, 20)  # generous timeout

        try:
            title = wait.until(EC.visibility_of_element_located((By.XPATH, '//h1'))).text.strip()
            self.logger.info("Title: %s", title)

            img_urls = [img.get_attribute('src') for img in self.driver.find_elements(By.XPATH, '//img[@decoding="async"]')]
            self.logger.info("Image URLs: %s", img_urls)

            if not img_urls:
                self.logger.warning("No image URLs found.")

            self.title_count += 1  # one more detail page processed
            if self.title_count >= self.title_limit:
                self.crawler.engine.close_spider(self, 'title limit reached, stopping the spider')

            item = PicItem(image_urls=img_urls, title=title)
            yield item

        except Exception as e:
            self.logger.error("Error while parsing the detail page: %s", e)
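
Note that settings.py below registers scrapy_selenium's SeleniumMiddleware, while the spider above drives its own webdriver.Chrome() instance directly. If you would rather let that middleware own the browser, a minimal sketch could look like the following (the class and spider names here are illustrative, not part of the original project, and assume scrapy_selenium is installed and configured as in the settings):

import scrapy
from scrapy_selenium import SeleniumRequest
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC


class SeleniumVariantSpider(scrapy.Spider):
    name = "moving_pic_selenium"  # illustrative name
    allowed_domains = ["dimtown.com"]
    start_urls = ["https://dimtown.com/jxmt"]

    def start_requests(self):
        for url in self.start_urls:
            yield SeleniumRequest(
                url=url,
                callback=self.parse,
                wait_time=20,
                # hand the page back only once the "most commented" link is present
                wait_until=EC.presence_of_element_located(
                    (By.XPATH, '//a[@data-orderby="comment_count"]')
                ),
            )

    def parse(self, response):
        # SeleniumMiddleware exposes the live driver through request.meta
        driver = response.request.meta['driver']
        for link in driver.find_elements(By.XPATH, '//a[contains(@href, ".html")]'):
            yield SeleniumRequest(url=link.get_attribute('href'), callback=self.parse_detail)

    def parse_detail(self, response):
        # the response body is the rendered HTML, so plain XPath selectors work here
        title = response.xpath('//h1/text()').get(default='').strip()
        img_urls = response.xpath('//img[@decoding="async"]/@src').getall()
        yield {'title': title, 'image_urls': img_urls}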

Pipeline:

import os
import scrapy
from scrapy.pipelines.images import ImagesPipeline
from urllib.parse import urlparse

class CustomImagePipeline(ImagesPipeline):

    def get_media_requests(self, item, info):
        for image_url in item.get('image_urls', []):
            yield scrapy.Request(image_url, meta={'item': item})

    def file_path(self, request, response=None, info=None, *, item=None):
        item = request.meta['item']
        title = item.get('title', 'default_title').replace(' ', '_')
        parsed_url = urlparse(request.url)
        image_name = os.path.basename(parsed_url.path)
        return f'{title}/{image_name}'

    def item_completed(self, results, item, info):
        if not results:
            return item

        image_paths = [x['path'] for ok, x in results if ok]
        item['images'] = image_paths

        return item
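
With this file_path override, every image is stored under IMAGES_STORE in a folder named after the post title. A quick hand check of the path logic (the URL and title below are made up for illustration):

from urllib.parse import urlparse
import os

url = "https://example.com/uploads/2024/01/sample.jpg"  # hypothetical image URL
title = "Some Post Title".replace(' ', '_')
print(f"{title}/{os.path.basename(urlparse(url).path)}")  # -> Some_Post_Title/sample.jpg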

Middleware:

from scrapy.downloadermiddlewares.redirect import RedirectMiddleware

from fake_useragent import UserAgent


class CustomRedirectMiddleware(RedirectMiddleware):
    def _redirect(self, redirected, request, spider, reason):
        # re-issue the request to the original URL instead of following the redirect target
        redirected = redirected.replace(url=request.url)
        return super()._redirect(redirected, request, spider, reason)


class RandomUserAgentMiddleware(object):
    def __init__(self):
        self.ua = UserAgent()

    def process_request(self, request, spider):
        user_agent = self.ua.random
        request.headers['User-Agent'] = user_agent
        request.headers['Accept'] = "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"
        request.headers['Accept-Language'] = "en"
        request.headers['Referer'] = 'https://dimtown.com/cosplay/page/1'  # adjust to your needs
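
CustomRedirectMiddleware is defined here but never enabled in the DOWNLOADER_MIDDLEWARES shown in settings.py below. If you want it to take over redirect handling, one way to wire it in (a sketch, not part of the original settings) is to disable the stock middleware and register the custom one in its default slot:

# settings.py (sketch)
DOWNLOADER_MIDDLEWARES = {
    'scrapy.downloadermiddlewares.redirect.RedirectMiddleware': None,  # turn off the built-in one
    'moving.middlewares.CustomRedirectMiddleware': 600,  # 600 is the built-in middleware's default priority
    'moving.middlewares.RandomUserAgentMiddleware': 543,
    'scrapy_selenium.SeleniumMiddleware': 800,
}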

Settings:

# Scrapy settings for moving project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# https://docs.scrapy.org/en/latest/topics/settings.html
# https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = "moving"

SPIDER_MODULES = ["moving.spiders"]
NEWSPIDER_MODULE = "moving.spiders"
# imports needed for the Selenium configuration below

from shutil import which
from selenium.webdriver.chrome.service import Service

# Selenium configuration
SELENIUM_DRIVER_NAME = 'chrome'
SELENIUM_DRIVER_SERVICE_ARGS = ['--log-path=/path/to/chromedriver.log']
SELENIUM_DRIVER_ARGUMENTS = ['--headless']  # enable headless mode if desired

SELENIUM_SERVICE = Service(which('chromedriver'))  # wrap the chromedriver path in a Service object
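# Note (assumption): depending on the scrapy_selenium version in use, the middleware may also
# expect SELENIUM_DRIVER_EXECUTABLE_PATH (the path to the chromedriver binary), e.g.:
# SELENIUM_DRIVER_EXECUTABLE_PATH = which('chromedriver')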


# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = "moving (+http://www.yourdomain.com)"

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
# settings.py

DOWNLOAD_DELAY = 3  # delay between requests to the same website (seconds)
RANDOMIZE_DOWNLOAD_DELAY = True  # randomize the download delay

# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
# "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
# "Accept-Language": "en",
#}

# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
# "moving.middlewares.MovingSpiderMiddleware": 543,
#}

# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
DOWNLOADER_MIDDLEWARES = {
    'scrapy_selenium.SeleniumMiddleware': 800,
    # use a distinct priority so the ordering relative to SeleniumMiddleware is unambiguous
    'moving.middlewares.RandomUserAgentMiddleware': 543,
}

# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
# "scrapy.extensions.telnet.TelnetConsole": None,
#}

# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    # must match the pipeline class defined in moving/pipelines.py (CustomImagePipeline above)
    'moving.pipelines.CustomImagePipeline': 1,
}

IMAGES_STORE = 'moving'
# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = "httpcache"
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = "scrapy.extensions.httpcache.FilesystemCacheStorage"

# Set settings whose default value is deprecated to a future-proof value
REQUEST_FINGERPRINTER_IMPLEMENTATION = "2.7"
TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor"
FEED_EXPORT_ENCODING = "utf-8"
# settings.py

AUTOTHROTTLE_ENABLED = True
AUTOTHROTTLE_START_DELAY = 1  # initial download delay (seconds)
AUTOTHROTTLE_MAX_DELAY = 5  # maximum download delay under high latency (seconds)
AUTOTHROTTLE_TARGET_CONCURRENCY = 3.0  # average requests sent in parallel to each remote server
AUTOTHROTTLE_DEBUG = False  # show AutoThrottle debug stats for every response


# settings.py

RETRY_ENABLED = True
RETRY_TIMES = 5  # number of retries per request
RETRY_HTTP_CODES = [429, 500, 502, 503, 504, 522, 524, 408]  # HTTP status codes to retry
RETRY_DELAY = 5  # intended retry delay (seconds); note: not a built-in Scrapy setting

Item:

import scrapy


class PicItem(scrapy.Item):
    title = scrapy.Field()
    image_urls = scrapy.Field()
    images = scrapy.Field()
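
The field names matter here: ImagesPipeline-based pipelines read image_urls and write the stored file information into images, while title is only used by CustomImagePipeline to build the folder name. A small illustration with made-up values:

from moving.items import PicItem  # assumes the project package is named "moving", as in settings.py

item = PicItem(title="Sample Post", image_urls=["https://example.com/a.jpg"])  # hypothetical values
# After CustomImagePipeline.item_completed runs, item['images'] would hold the stored
# relative paths, e.g. ["Sample_Post/a.jpg"] under IMAGES_STORE.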

Start script:


from scrapy import cmdline
cmdline.execute('scrapy crawl moving_pic'.split(' '))
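
If you prefer to launch the crawl without going through cmdline, an equivalent sketch using Scrapy's CrawlerProcess:

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

process = CrawlerProcess(get_project_settings())  # picks up the project's settings.py
process.crawl('moving_pic')  # the spider name defined on MirrowSpider
process.start()  # blocks until the crawl finishes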

Result:
