
Error when downloading images #27

@cosisoft

Description


[root@localhost woaidu_crawler]# scrapy crawl woaidu
Unhandled error in Deferred:
Unhandled Error
Traceback (most recent call last):
File "/usr/lib64/python2.7/site-packages/scrapy/commands/crawl.py", line 57, in run
self.crawler_process.crawl(spname, **opts.spargs)
File "/usr/lib64/python2.7/site-packages/scrapy/crawler.py", line 163, in crawl
return self._crawl(crawler, *args, **kwargs)
File "/usr/lib64/python2.7/site-packages/scrapy/crawler.py", line 167, in _crawl
d = crawler.crawl(*args, **kwargs)
File "/usr/lib64/python2.7/site-packages/twisted/internet/defer.py", line 1181, in unwindGenerator
return _inlineCallbacks(None, gen, Deferred())
--- ---
File "/usr/lib64/python2.7/site-packages/twisted/internet/defer.py", line 1039, in _inlineCallbacks
result = g.send(result)
File "/usr/lib64/python2.7/site-packages/scrapy/crawler.py", line 90, in crawl
six.reraise(*exc_info)
File "/usr/lib64/python2.7/site-packages/scrapy/crawler.py", line 72, in crawl
self.engine = self._create_engine()
File "/usr/lib64/python2.7/site-packages/scrapy/crawler.py", line 97, in _create_engine
return ExecutionEngine(self, lambda _: self.stop())
File "/usr/lib64/python2.7/site-packages/scrapy/core/engine.py", line 69, in init
self.scraper = Scraper(crawler)
File "/usr/lib64/python2.7/site-packages/scrapy/core/scraper.py", line 71, in init
self.itemproc = itemproc_cls.from_crawler(crawler)
File "/usr/lib64/python2.7/site-packages/scrapy/middleware.py", line 58, in from_crawler
return cls.from_settings(crawler.settings, crawler)
File "/usr/lib64/python2.7/site-packages/scrapy/middleware.py", line 36, in from_settings
mw = mwcls.from_crawler(crawler)
File "/usr/lib64/python2.7/site-packages/scrapy/pipelines/media.py", line 51, in from_crawler
pipe = cls.from_settings(crawler.settings)
File "/usr/lib64/python2.7/site-packages/scrapy/pipelines/images.py", line 95, in from_settings
return cls(store_uri, settings=settings)
exceptions.TypeError: __init__() got an unexpected keyword argument 'settings'
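
For reference: the scrapy/pipelines/images.py frame at the bottom of the traceback shows from_settings() calling cls(store_uri, settings=settings), so the TypeError points at an ImagesPipeline subclass whose __init__ does not accept a settings keyword argument. A minimal sketch of a signature that would accept and forward it, assuming the WoaiduCoverImage pipeline named in the settings below subclasses ImagesPipeline (the class body here is illustrative, not the project's actual code):

from scrapy.pipelines.images import ImagesPipeline

class WoaiduCoverImage(ImagesPipeline):
    def __init__(self, store_uri, download_func=None, settings=None):
        # Newer Scrapy calls cls(store_uri, settings=settings) inside
        # from_settings(), as shown in the traceback, so the subclass
        # __init__ has to accept that keyword and pass it to the base class.
        super(WoaiduCoverImage, self).__init__(
            store_uri, download_func=download_func, settings=settings)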


The contents of settings.py are as follows:

#!/usr/bin/python
# -*- coding:utf-8 -*-

# Scrapy settings for woaidu_crawler project

import os

PROJECT_DIR = os.path.abspath(os.path.dirname(__file__))

BOT_NAME = 'woaidu_crawler'

SPIDER_MODULES = ['woaidu_crawler.spiders']
NEWSPIDER_MODULE = 'woaidu_crawler.spiders'

DOWNLOAD_DELAY = 1
CONCURRENT_ITEMS = 100
CONCURRENT_REQUESTS = 16

# The maximum number of concurrent (i.e. simultaneous) requests that will be performed to any single domain.

CONCURRENT_REQUESTS_PER_DOMAIN = 8
CONCURRENT_REQUESTS_PER_IP = 0
DEPTH_LIMIT = 0
DEPTH_PRIORITY = 0
DNSCACHE_ENABLED = True

DUPEFILTER_CLASS = 'scrapy.dupefilter.RFPDupeFilter'

SCHEDULER = 'scrapy.core.scheduler.Scheduler'

# AutoThrottle extension

AUTOTHROTTLE_ENABLED = True
AUTOTHROTTLE_START_DELAY = 3.0
AUTOTHROTTLE_CONCURRENCY_CHECK_PERIOD = 10  # How many responses should pass to perform concurrency adjustments.

# XXX: Scrapy's item pipelines are ordered! An item goes through all the
# pipelines in the order listed below, so if a pipeline changes the item
# and returns it, the modified item is passed on to the next pipeline
# (a minimal illustration follows after this settings listing).
# XXX: notice:
# if you want to use sharded MongoDB, you need MongodbWoaiduBookFile and ShardMongodbPipeline;
# if you want to use a single MongoDB, you need WoaiduBookFile and SingleMongodbPipeline.

ITEM_PIPELINES = ['woaidu_crawler.pipelines.cover_image.WoaiduCoverImage',
    'woaidu_crawler.pipelines.bookfile.WoaiduBookFile',
    'woaidu_crawler.pipelines.mongodb_book_file.MongodbWoaiduBookFile',
    'woaidu_crawler.pipelines.drop_none_download.DropNoneBookFile',
    'woaidu_crawler.pipelines.mongodb.SingleMongodbPipeline',
    'woaidu_crawler.pipelines.mongodb.ShardMongodbPipeline',
    'woaidu_crawler.pipelines.final_test.FinalTestPipeline',]

ITEM_PIPELINES = ['woaidu_crawler.pipelines.WoaiduBookFile',]

ITEM_PIPELINES = {'woaidu_crawler.pipelines.cover_image.WoaiduCoverImage': 300,
    'woaidu_crawler.pipelines.mongodb_book_file.MongodbWoaiduBookFile': 400,
    'woaidu_crawler.pipelines.drop_none_download.DropNoneBookFile': 500,
    'woaidu_crawler.pipelines.mongodb.ShardMongodbPipeline': 600,
    'woaidu_crawler.pipelines.final_test.FinalTestPipeline': 700,}

ITEM_PIPELINES = {'woaidu_crawler.pipelines.cover_image.WoaiduCoverImage': 300,
    'woaidu_crawler.pipelines.bookfile.WoaiduBookFile': 400,
    'woaidu_crawler.pipelines.drop_none_download.DropNoneBookFile': 500,
    'woaidu_crawler.pipelines.mongodb.SingleMongodbPipeline': 600,
    'woaidu_crawler.pipelines.final_test.FinalTestPipeline': 700,}

IMAGES_STORE = os.path.join(PROJECT_DIR,'media/book_covor_image')
IMAGES_EXPIRES = 30
IMAGES_THUMBS = {
'small': (50, 50),
'big': (270, 270),
}

IMAGES_MIN_HEIGHT = 0
IMAGES_MIN_WIDTH = 0

COOKIES_ENABLED = False

USER_AGENT = 'Mozilla/5.0 (X11; Linux i686) AppleWebKit/537.31 (KHTML, like Gecko) Chrome/26.0.1410.43 Safari/537.31'

DOWNLOADER_MIDDLEWARES = {

'woaidu_crawler.contrib.downloadmiddleware.google_cache.GoogleCacheMiddleware':50,

'scrapy.contrib.downloadermiddleware.useragent.UserAgentMiddleware': None,
'woaidu_crawler.contrib.downloadmiddleware.rotate_useragent.RotateUserAgentMiddleware':400,

}

GOOGLE_CACHE_DOMAINS = ['www.woaidu.org',]

# To make RotateUserAgentMiddleware take effect, leave USER_AGENT empty:

USER_AGENT = ''

FILE_EXPIRES = 30
BOOK_FILE_EXPIRES = 30
FILE_STORE = os.path.join(PROJECT_DIR,'media/files')
BOOK_FILE_STORE = os.path.join(PROJECT_DIR,'media/book_files')

# For more MIME types for files, you can visit:
# http://mimeapplication.net/

BOOK_FILE_CONTENT_TYPE = ['application/file',
'application/zip',
'application/octet-stream',
'application/x-zip-compressed',
'application/x-octet-stream',
'application/gzip',
'application/pdf',
'application/ogg',
'application/vnd.oasis.opendocument.text',
'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
'application/x-dvi',
'application/x-rar-compressed',
'application/x-tar',
'multipart/x-zip',
'application/x-zip',
'application/x-winzip',
'application/x-compress',
'application/x-compressed',
'application/x-gzip',
'zz-application/zz-winassoc-arj',
'application/x-stuffit',
'application/arj',
'application/x-arj',
'multipart/x-tar',
'text/plain',]

URL_GBK_DOMAIN = ['www.paofuu.com',
'down.wmtxt.com',
'www.txt163.com',
'down.txt163.com',
'down.sjtxt.com:8199',
'file.txtbook.com.cn',
'www.yyytxt.com',
'www.27xs.org',
'down.dusuu.com:8199',
'down.txtqb.cn']
ATTACHMENT_FILENAME_UTF8_DOMAIN = []

FILE_EXTENTION = ['.doc','.txt','.docx','.rar','.zip','.pdf']

Drop_NoneBookFile = True

LOG_FILE = "logs/scrapy.log"

STATS_CLASS = 'woaidu_crawler.statscol.graphite.RedisGraphiteStatsCollector'

GRAPHITE_HOST = '127.0.0.1'
GRAPHITE_PORT = 2003
GRAPHITE_IGNOREKEYS = []

SingleMONGODB_SERVER = "localhost"
SingleMONGODB_PORT = 27017
SingleMONGODB_DB = "books_fs"

ShardMONGODB_SERVER = "localhost"
ShardMONGODB_PORT = 27017
ShardMONGODB_DB = "books_mongo"
GridFs_Collection = "book_file"

SCHEDULER = "woaidu_crawler.scrapy_redis.scheduler.Scheduler"
SCHEDULER_PERSIST = False
SCHEDULER_QUEUE_CLASS = 'woaidu_crawler.scrapy_redis.queue.SpiderPriorityQueue'
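
As a side note on the pipeline-order comment in the settings above: each pipeline's process_item() receives whatever item the previous pipeline returned, so a modified item propagates down the ITEM_PIPELINES order. A minimal, generic sketch with hypothetical class and field names (not the project's actual pipelines):

from scrapy.exceptions import DropItem

class AddCoverPathPipeline(object):
    def process_item(self, item, spider):
        # Whatever is returned here is the item the next pipeline sees.
        item['cover_path'] = item.get('cover_path') or 'unknown'
        return item

class DropNoneBookFilePipeline(object):
    def process_item(self, item, spider):
        # Raising DropItem stops the item from reaching later pipelines.
        if not item.get('book_file'):
            raise DropItem("no book file downloaded")
        return item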


Is this configuration incorrect?
