From 5e997587d9b13344a0afa9bb4cf781829a66ce23 Mon Sep 17 00:00:00 2001 From: Andrey Rahmatullin Date: Sun, 20 Sep 2020 18:06:46 +0500 Subject: [PATCH] Remove dead boto2 code, deprecate is_botocore() (#4776) --- scrapy/core/downloader/handlers/s3.py | 55 ++++----------- scrapy/extensions/feedexport.py | 35 ++++------ scrapy/pipelines/files.py | 97 +++++++++------------------ scrapy/utils/boto.py | 23 ++++++- scrapy/utils/test.py | 29 +++----- tests/test_feedexport.py | 69 ++----------------- tests/test_pipeline_files.py | 15 ++--- 7 files changed, 96 insertions(+), 227 deletions(-) diff --git a/scrapy/core/downloader/handlers/s3.py b/scrapy/core/downloader/handlers/s3.py index 0ef977893db..1966570d4c1 100644 --- a/scrapy/core/downloader/handlers/s3.py +++ b/scrapy/core/downloader/handlers/s3.py @@ -2,41 +2,20 @@ from scrapy.core.downloader.handlers.http import HTTPDownloadHandler from scrapy.exceptions import NotConfigured -from scrapy.utils.boto import is_botocore +from scrapy.utils.boto import is_botocore_available from scrapy.utils.httpobj import urlparse_cached from scrapy.utils.misc import create_instance -def _get_boto_connection(): - from boto.s3.connection import S3Connection - - class _v19_S3Connection(S3Connection): - """A dummy S3Connection wrapper that doesn't do any synchronous download""" - def _mexe(self, method, bucket, key, headers, *args, **kwargs): - return headers - - class _v20_S3Connection(S3Connection): - """A dummy S3Connection wrapper that doesn't do any synchronous download""" - def _mexe(self, http_request, *args, **kwargs): - http_request.authorize(connection=self) - return http_request.headers - - try: - import boto.auth # noqa: F401 - except ImportError: - _S3Connection = _v19_S3Connection - else: - _S3Connection = _v20_S3Connection - - return _S3Connection - - class S3DownloadHandler: def __init__(self, settings, *, crawler=None, aws_access_key_id=None, aws_secret_access_key=None, httpdownloadhandler=HTTPDownloadHandler, **kw): + if not is_botocore_available(): + raise NotConfigured('missing botocore library') + if not aws_access_key_id: aws_access_key_id = settings['AWS_ACCESS_KEY_ID'] if not aws_secret_access_key: @@ -51,23 +30,15 @@ def __init__(self, settings, *, self.anon = kw.get('anon') self._signer = None - if is_botocore(): - import botocore.auth - import botocore.credentials - kw.pop('anon', None) - if kw: - raise TypeError(f'Unexpected keyword arguments: {kw}') - if not self.anon: - SignerCls = botocore.auth.AUTH_TYPE_MAPS['s3'] - self._signer = SignerCls(botocore.credentials.Credentials( - aws_access_key_id, aws_secret_access_key)) - else: - _S3Connection = _get_boto_connection() - try: - self.conn = _S3Connection( - aws_access_key_id, aws_secret_access_key, **kw) - except Exception as ex: - raise NotConfigured(str(ex)) + import botocore.auth + import botocore.credentials + kw.pop('anon', None) + if kw: + raise TypeError(f'Unexpected keyword arguments: {kw}') + if not self.anon: + SignerCls = botocore.auth.AUTH_TYPE_MAPS['s3'] + self._signer = SignerCls(botocore.credentials.Credentials( + aws_access_key_id, aws_secret_access_key)) _http_handler = create_instance( objcls=httpdownloadhandler, diff --git a/scrapy/extensions/feedexport.py b/scrapy/extensions/feedexport.py index 980825499ce..9f712285ffb 100644 --- a/scrapy/extensions/feedexport.py +++ b/scrapy/extensions/feedexport.py @@ -19,7 +19,7 @@ from scrapy import signals from scrapy.exceptions import NotConfigured, ScrapyDeprecationWarning -from scrapy.utils.boto import is_botocore +from 
scrapy.utils.boto import is_botocore_available from scrapy.utils.conf import feed_complete_default_values_from_settings from scrapy.utils.ftp import ftp_store_file from scrapy.utils.log import failure_to_exc_info @@ -120,22 +120,19 @@ class S3FeedStorage(BlockingFeedStorage): def __init__(self, uri, access_key=None, secret_key=None, acl=None, *, feed_options=None): + if not is_botocore_available(): + raise NotConfigured('missing botocore library') u = urlparse(uri) self.bucketname = u.hostname self.access_key = u.username or access_key self.secret_key = u.password or secret_key - self.is_botocore = is_botocore() self.keyname = u.path[1:] # remove first "/" self.acl = acl - if self.is_botocore: - import botocore.session - session = botocore.session.get_session() - self.s3_client = session.create_client( - 's3', aws_access_key_id=self.access_key, - aws_secret_access_key=self.secret_key) - else: - import boto - self.connect_s3 = boto.connect_s3 + import botocore.session + session = botocore.session.get_session() + self.s3_client = session.create_client( + 's3', aws_access_key_id=self.access_key, + aws_secret_access_key=self.secret_key) if feed_options and feed_options.get('overwrite', True) is False: logger.warning('S3 does not support appending to files. To ' 'suppress this warning, remove the overwrite ' @@ -154,18 +151,10 @@ def from_crawler(cls, crawler, uri, *, feed_options=None): def _store_in_thread(self, file): file.seek(0) - if self.is_botocore: - kwargs = {'ACL': self.acl} if self.acl else {} - self.s3_client.put_object( - Bucket=self.bucketname, Key=self.keyname, Body=file, - **kwargs) - else: - conn = self.connect_s3(self.access_key, self.secret_key) - bucket = conn.get_bucket(self.bucketname, validate=False) - key = bucket.new_key(self.keyname) - kwargs = {'policy': self.acl} if self.acl else {} - key.set_contents_from_file(file, **kwargs) - key.close() + kwargs = {'ACL': self.acl} if self.acl else {} + self.s3_client.put_object( + Bucket=self.bucketname, Key=self.keyname, Body=file, + **kwargs) file.close() diff --git a/scrapy/pipelines/files.py b/scrapy/pipelines/files.py index 99a72aa707f..13ecd4e6c59 100644 --- a/scrapy/pipelines/files.py +++ b/scrapy/pipelines/files.py @@ -11,7 +11,6 @@ import time from collections import defaultdict from contextlib import suppress -from email.utils import mktime_tz, parsedate_tz from ftplib import FTP from io import BytesIO from urllib.parse import urlparse @@ -23,7 +22,7 @@ from scrapy.http import Request from scrapy.pipelines.media import MediaPipeline from scrapy.settings import Settings -from scrapy.utils.boto import is_botocore +from scrapy.utils.boto import is_botocore_available from scrapy.utils.datatypes import CaselessDict from scrapy.utils.ftp import ftp_store_file from scrapy.utils.log import failure_to_exc_info @@ -91,86 +90,54 @@ class S3FilesStore: } def __init__(self, uri): - self.is_botocore = is_botocore() - if self.is_botocore: - import botocore.session - session = botocore.session.get_session() - self.s3_client = session.create_client( - 's3', - aws_access_key_id=self.AWS_ACCESS_KEY_ID, - aws_secret_access_key=self.AWS_SECRET_ACCESS_KEY, - endpoint_url=self.AWS_ENDPOINT_URL, - region_name=self.AWS_REGION_NAME, - use_ssl=self.AWS_USE_SSL, - verify=self.AWS_VERIFY - ) - else: - from boto.s3.connection import S3Connection - self.S3Connection = S3Connection + if not is_botocore_available(): + raise NotConfigured('missing botocore library') + import botocore.session + session = botocore.session.get_session() + self.s3_client = 
session.create_client( + 's3', + aws_access_key_id=self.AWS_ACCESS_KEY_ID, + aws_secret_access_key=self.AWS_SECRET_ACCESS_KEY, + endpoint_url=self.AWS_ENDPOINT_URL, + region_name=self.AWS_REGION_NAME, + use_ssl=self.AWS_USE_SSL, + verify=self.AWS_VERIFY + ) if not uri.startswith("s3://"): raise ValueError(f"Incorrect URI scheme in {uri}, expected 's3'") self.bucket, self.prefix = uri[5:].split('/', 1) def stat_file(self, path, info): def _onsuccess(boto_key): - if self.is_botocore: - checksum = boto_key['ETag'].strip('"') - last_modified = boto_key['LastModified'] - modified_stamp = time.mktime(last_modified.timetuple()) - else: - checksum = boto_key.etag.strip('"') - last_modified = boto_key.last_modified - modified_tuple = parsedate_tz(last_modified) - modified_stamp = int(mktime_tz(modified_tuple)) + checksum = boto_key['ETag'].strip('"') + last_modified = boto_key['LastModified'] + modified_stamp = time.mktime(last_modified.timetuple()) return {'checksum': checksum, 'last_modified': modified_stamp} return self._get_boto_key(path).addCallback(_onsuccess) - def _get_boto_bucket(self): - # disable ssl (is_secure=False) because of this python bug: - # https://bugs.python.org/issue5103 - c = self.S3Connection(self.AWS_ACCESS_KEY_ID, self.AWS_SECRET_ACCESS_KEY, is_secure=False) - return c.get_bucket(self.bucket, validate=False) - def _get_boto_key(self, path): key_name = f'{self.prefix}{path}' - if self.is_botocore: - return threads.deferToThread( - self.s3_client.head_object, - Bucket=self.bucket, - Key=key_name) - else: - b = self._get_boto_bucket() - return threads.deferToThread(b.get_key, key_name) + return threads.deferToThread( + self.s3_client.head_object, + Bucket=self.bucket, + Key=key_name) def persist_file(self, path, buf, info, meta=None, headers=None): """Upload file to S3 storage""" key_name = f'{self.prefix}{path}' buf.seek(0) - if self.is_botocore: - extra = self._headers_to_botocore_kwargs(self.HEADERS) - if headers: - extra.update(self._headers_to_botocore_kwargs(headers)) - return threads.deferToThread( - self.s3_client.put_object, - Bucket=self.bucket, - Key=key_name, - Body=buf, - Metadata={k: str(v) for k, v in (meta or {}).items()}, - ACL=self.POLICY, - **extra) - else: - b = self._get_boto_bucket() - k = b.new_key(key_name) - if meta: - for metakey, metavalue in meta.items(): - k.set_metadata(metakey, str(metavalue)) - h = self.HEADERS.copy() - if headers: - h.update(headers) - return threads.deferToThread( - k.set_contents_from_string, buf.getvalue(), - headers=h, policy=self.POLICY) + extra = self._headers_to_botocore_kwargs(self.HEADERS) + if headers: + extra.update(self._headers_to_botocore_kwargs(headers)) + return threads.deferToThread( + self.s3_client.put_object, + Bucket=self.bucket, + Key=key_name, + Body=buf, + Metadata={k: str(v) for k, v in (meta or {}).items()}, + ACL=self.POLICY, + **extra) def _headers_to_botocore_kwargs(self, headers): """ Convert headers to botocore keyword agruments. diff --git a/scrapy/utils/boto.py b/scrapy/utils/boto.py index 12321caa5d1..3374c57c7e9 100644 --- a/scrapy/utils/boto.py +++ b/scrapy/utils/boto.py @@ -1,11 +1,32 @@ """Boto/botocore helpers""" +import warnings -from scrapy.exceptions import NotConfigured +from scrapy.exceptions import NotConfigured, ScrapyDeprecationWarning def is_botocore(): + """ Returns True if botocore is available, otherwise raises NotConfigured. Never returns False. + + Previously, when boto was supported in addition to botocore, this returned False if boto was available + but botocore wasn't. 
+ """ + message = ( + 'is_botocore() is deprecated and always returns True or raises an Exception, ' + 'so it cannot be used for checking if boto is available instead of botocore. ' + 'You can use scrapy.utils.boto.is_botocore_available() to check if botocore ' + 'is available.' + ) + warnings.warn(message, ScrapyDeprecationWarning, stacklevel=2) try: import botocore # noqa: F401 return True except ImportError: raise NotConfigured('missing botocore library') + + +def is_botocore_available(): + try: + import botocore # noqa: F401 + return True + except ImportError: + return False diff --git a/scrapy/utils/test.py b/scrapy/utils/test.py index f54942ffb13..94d0ae2d355 100644 --- a/scrapy/utils/test.py +++ b/scrapy/utils/test.py @@ -10,8 +10,7 @@ from importlib import import_module from twisted.trial.unittest import SkipTest -from scrapy.exceptions import NotConfigured -from scrapy.utils.boto import is_botocore +from scrapy.utils.boto import is_botocore_available def assert_aws_environ(): @@ -29,29 +28,19 @@ def assert_gcs_environ(): def skip_if_no_boto(): - try: - is_botocore() - except NotConfigured as e: - raise SkipTest(e) + if not is_botocore_available(): + raise SkipTest('missing botocore library') def get_s3_content_and_delete(bucket, path, with_key=False): """ Get content from s3 key, and delete key afterwards. """ - if is_botocore(): - import botocore.session - session = botocore.session.get_session() - client = session.create_client('s3') - key = client.get_object(Bucket=bucket, Key=path) - content = key['Body'].read() - client.delete_object(Bucket=bucket, Key=path) - else: - import boto - # assuming boto=2.2.2 - bucket = boto.connect_s3().get_bucket(bucket, validate=False) - key = bucket.get_key(path) - content = key.get_contents_as_string() - bucket.delete_key(path) + import botocore.session + session = botocore.session.get_session() + client = session.create_client('s3') + key = client.get_object(Bucket=bucket, Key=path) + content = key['Body'].read() + client.delete_object(Bucket=bucket, Key=path) return (content, key) if with_key else content diff --git a/tests/test_feedexport.py b/tests/test_feedexport.py index 840e0f87b92..33ac5171227 100644 --- a/tests/test_feedexport.py +++ b/tests/test_feedexport.py @@ -45,6 +45,7 @@ get_s3_content_and_delete, get_crawler, mock_google_cloud_storage, + skip_if_no_boto, ) from tests.mockserver import MockFTPServer, MockServer @@ -227,10 +228,7 @@ def test_invalid_folder(self): class S3FeedStorageTest(unittest.TestCase): def test_parse_credentials(self): - try: - import botocore # noqa: F401 - except ImportError: - raise unittest.SkipTest("S3FeedStorage requires botocore") + skip_if_no_boto() aws_credentials = {'AWS_ACCESS_KEY_ID': 'settings_key', 'AWS_SECRET_ACCESS_KEY': 'settings_secret'} crawler = get_crawler(settings_dict=aws_credentials) @@ -324,11 +322,7 @@ def test_from_crawler_with_acl(self): @defer.inlineCallbacks def test_store_botocore_without_acl(self): - try: - import botocore # noqa: F401 - except ImportError: - raise unittest.SkipTest('botocore is required') - + skip_if_no_boto() storage = S3FeedStorage( 's3://mybucket/export.csv', 'access_key', @@ -344,11 +338,7 @@ def test_store_botocore_without_acl(self): @defer.inlineCallbacks def test_store_botocore_with_acl(self): - try: - import botocore # noqa: F401 - except ImportError: - raise unittest.SkipTest('botocore is required') - + skip_if_no_boto() storage = S3FeedStorage( 's3://mybucket/export.csv', 'access_key', @@ -366,57 +356,6 @@ def test_store_botocore_with_acl(self): 
'custom-acl' ) - @defer.inlineCallbacks - def test_store_not_botocore_without_acl(self): - storage = S3FeedStorage( - 's3://mybucket/export.csv', - 'access_key', - 'secret_key', - ) - self.assertEqual(storage.access_key, 'access_key') - self.assertEqual(storage.secret_key, 'secret_key') - self.assertEqual(storage.acl, None) - - storage.is_botocore = False - storage.connect_s3 = mock.MagicMock() - self.assertFalse(storage.is_botocore) - - yield storage.store(BytesIO(b'test file')) - - conn = storage.connect_s3(*storage.connect_s3.call_args) - bucket = conn.get_bucket(*conn.get_bucket.call_args) - key = bucket.new_key(*bucket.new_key.call_args) - self.assertNotIn( - dict(policy='custom-acl'), - key.set_contents_from_file.call_args - ) - - @defer.inlineCallbacks - def test_store_not_botocore_with_acl(self): - storage = S3FeedStorage( - 's3://mybucket/export.csv', - 'access_key', - 'secret_key', - 'custom-acl' - ) - self.assertEqual(storage.access_key, 'access_key') - self.assertEqual(storage.secret_key, 'secret_key') - self.assertEqual(storage.acl, 'custom-acl') - - storage.is_botocore = False - storage.connect_s3 = mock.MagicMock() - self.assertFalse(storage.is_botocore) - - yield storage.store(BytesIO(b'test file')) - - conn = storage.connect_s3(*storage.connect_s3.call_args) - bucket = conn.get_bucket(*conn.get_bucket.call_args) - key = bucket.new_key(*bucket.new_key.call_args) - self.assertIn( - dict(policy='custom-acl'), - key.set_contents_from_file.call_args - ) - def test_overwrite_default(self): with LogCapture() as log: S3FeedStorage( diff --git a/tests/test_pipeline_files.py b/tests/test_pipeline_files.py index 1dd7031fe77..d5b0bb3d804 100644 --- a/tests/test_pipeline_files.py +++ b/tests/test_pipeline_files.py @@ -22,7 +22,6 @@ S3FilesStore, ) from scrapy.settings import Settings -from scrapy.utils.boto import is_botocore from scrapy.utils.test import ( assert_aws_environ, assert_gcs_environ, @@ -437,16 +436,10 @@ def test_persist(self): content, key = get_s3_content_and_delete( u.hostname, u.path[1:], with_key=True) self.assertEqual(content, data) - if is_botocore(): - self.assertEqual(key['Metadata'], {'foo': 'bar'}) - self.assertEqual( - key['CacheControl'], S3FilesStore.HEADERS['Cache-Control']) - self.assertEqual(key['ContentType'], 'image/png') - else: - self.assertEqual(key.metadata, {'foo': 'bar'}) - self.assertEqual( - key.cache_control, S3FilesStore.HEADERS['Cache-Control']) - self.assertEqual(key.content_type, 'image/png') + self.assertEqual(key['Metadata'], {'foo': 'bar'}) + self.assertEqual( + key['CacheControl'], S3FilesStore.HEADERS['Cache-Control']) + self.assertEqual(key['ContentType'], 'image/png') class TestGCSFilesStore(unittest.TestCase):
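
Migration note for callers of is_botocore(): after this patch the helper always returns True or raises NotConfigured (and emits a ScrapyDeprecationWarning), so it can no longer be used to choose between boto and botocore code paths. Below is a minimal sketch of the replacement check, assuming only the is_botocore_available() helper added in scrapy/utils/boto.py above; the require_botocore() wrapper is illustrative and not part of Scrapy:

    from scrapy.exceptions import NotConfigured
    from scrapy.utils.boto import is_botocore_available


    def require_botocore():
        # is_botocore_available() returns a plain bool and never raises,
        # so the caller decides how to react when botocore is missing.
        if not is_botocore_available():
            # Mirror what the updated S3DownloadHandler, S3FeedStorage and
            # S3FilesStore do in this patch when botocore is not installed.
            raise NotConfigured('missing botocore library')


    # Example: fail fast at construction time of a component that needs S3 support.
    require_botocore()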