
Commit 5e99758
Remove dead boto2 code, deprecate is_botocore() (scrapy#4776)
wRAR authored Sep 20, 2020
1 parent 85e13af commit 5e99758
Showing 7 changed files with 96 additions and 227 deletions.
55 changes: 13 additions & 42 deletions scrapy/core/downloader/handlers/s3.py
@@ -2,41 +2,20 @@
 
 from scrapy.core.downloader.handlers.http import HTTPDownloadHandler
 from scrapy.exceptions import NotConfigured
-from scrapy.utils.boto import is_botocore
+from scrapy.utils.boto import is_botocore_available
 from scrapy.utils.httpobj import urlparse_cached
 from scrapy.utils.misc import create_instance
 
 
-def _get_boto_connection():
-    from boto.s3.connection import S3Connection
-
-    class _v19_S3Connection(S3Connection):
-        """A dummy S3Connection wrapper that doesn't do any synchronous download"""
-        def _mexe(self, method, bucket, key, headers, *args, **kwargs):
-            return headers
-
-    class _v20_S3Connection(S3Connection):
-        """A dummy S3Connection wrapper that doesn't do any synchronous download"""
-        def _mexe(self, http_request, *args, **kwargs):
-            http_request.authorize(connection=self)
-            return http_request.headers
-
-    try:
-        import boto.auth  # noqa: F401
-    except ImportError:
-        _S3Connection = _v19_S3Connection
-    else:
-        _S3Connection = _v20_S3Connection
-
-    return _S3Connection
-
-
 class S3DownloadHandler:
 
     def __init__(self, settings, *,
                  crawler=None,
                  aws_access_key_id=None, aws_secret_access_key=None,
                  httpdownloadhandler=HTTPDownloadHandler, **kw):
+        if not is_botocore_available():
+            raise NotConfigured('missing botocore library')
+
         if not aws_access_key_id:
             aws_access_key_id = settings['AWS_ACCESS_KEY_ID']
         if not aws_secret_access_key:
@@ -51,23 +30,15 @@ def __init__(self, settings, *,
         self.anon = kw.get('anon')
 
         self._signer = None
-        if is_botocore():
-            import botocore.auth
-            import botocore.credentials
-            kw.pop('anon', None)
-            if kw:
-                raise TypeError(f'Unexpected keyword arguments: {kw}')
-            if not self.anon:
-                SignerCls = botocore.auth.AUTH_TYPE_MAPS['s3']
-                self._signer = SignerCls(botocore.credentials.Credentials(
-                    aws_access_key_id, aws_secret_access_key))
-        else:
-            _S3Connection = _get_boto_connection()
-            try:
-                self.conn = _S3Connection(
-                    aws_access_key_id, aws_secret_access_key, **kw)
-            except Exception as ex:
-                raise NotConfigured(str(ex))
+        import botocore.auth
+        import botocore.credentials
+        kw.pop('anon', None)
+        if kw:
+            raise TypeError(f'Unexpected keyword arguments: {kw}')
+        if not self.anon:
+            SignerCls = botocore.auth.AUTH_TYPE_MAPS['s3']
+            self._signer = SignerCls(botocore.credentials.Credentials(
+                aws_access_key_id, aws_secret_access_key))
 
         _http_handler = create_instance(
             objcls=httpdownloadhandler,
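
Note: the retained botocore path builds a signer from botocore's auth map and uses it to add authentication headers to outgoing requests. A minimal standalone sketch of that signing flow, using placeholder credentials and a hypothetical bucket URL (both assumptions for illustration, not values from this commit):

import botocore.auth
import botocore.credentials
from botocore.awsrequest import AWSRequest

# Placeholder credentials; the handler above reads real ones from the
# AWS_ACCESS_KEY_ID / AWS_SECRET_ACCESS_KEY settings.
credentials = botocore.credentials.Credentials('AKID', 'SECRET')
SignerCls = botocore.auth.AUTH_TYPE_MAPS['s3']
signer = SignerCls(credentials)

# Sign a GET request for a hypothetical key; add_auth() mutates the
# request in place, adding the Authorization header.
request = AWSRequest(method='GET',
                     url='https://mybucket.s3.amazonaws.com/some/key')
signer.add_auth(request)
print(request.headers['Authorization'])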
35 changes: 12 additions & 23 deletions scrapy/extensions/feedexport.py
@@ -19,7 +19,7 @@
 
 from scrapy import signals
 from scrapy.exceptions import NotConfigured, ScrapyDeprecationWarning
-from scrapy.utils.boto import is_botocore
+from scrapy.utils.boto import is_botocore_available
 from scrapy.utils.conf import feed_complete_default_values_from_settings
 from scrapy.utils.ftp import ftp_store_file
 from scrapy.utils.log import failure_to_exc_info
@@ -120,22 +120,19 @@ class S3FeedStorage(BlockingFeedStorage):
 
     def __init__(self, uri, access_key=None, secret_key=None, acl=None, *,
                  feed_options=None):
+        if not is_botocore_available():
+            raise NotConfigured('missing botocore library')
         u = urlparse(uri)
         self.bucketname = u.hostname
         self.access_key = u.username or access_key
         self.secret_key = u.password or secret_key
-        self.is_botocore = is_botocore()
         self.keyname = u.path[1:]  # remove first "/"
         self.acl = acl
-        if self.is_botocore:
-            import botocore.session
-            session = botocore.session.get_session()
-            self.s3_client = session.create_client(
-                's3', aws_access_key_id=self.access_key,
-                aws_secret_access_key=self.secret_key)
-        else:
-            import boto
-            self.connect_s3 = boto.connect_s3
+        import botocore.session
+        session = botocore.session.get_session()
+        self.s3_client = session.create_client(
+            's3', aws_access_key_id=self.access_key,
+            aws_secret_access_key=self.secret_key)
         if feed_options and feed_options.get('overwrite', True) is False:
             logger.warning('S3 does not support appending to files. To '
                            'suppress this warning, remove the overwrite '
@@ -154,18 +151,10 @@ def from_crawler(cls, crawler, uri, *, feed_options=None):
 
     def _store_in_thread(self, file):
         file.seek(0)
-        if self.is_botocore:
-            kwargs = {'ACL': self.acl} if self.acl else {}
-            self.s3_client.put_object(
-                Bucket=self.bucketname, Key=self.keyname, Body=file,
-                **kwargs)
-        else:
-            conn = self.connect_s3(self.access_key, self.secret_key)
-            bucket = conn.get_bucket(self.bucketname, validate=False)
-            key = bucket.new_key(self.keyname)
-            kwargs = {'policy': self.acl} if self.acl else {}
-            key.set_contents_from_file(file, **kwargs)
-            key.close()
+        kwargs = {'ACL': self.acl} if self.acl else {}
+        self.s3_client.put_object(
+            Bucket=self.bucketname, Key=self.keyname, Body=file,
+            **kwargs)
         file.close()
 
 
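
Note: the same upload can be reproduced outside Scrapy with a bare botocore client. A minimal sketch with placeholder bucket, key, file name, and credentials (all assumptions for illustration):

import botocore.session

session = botocore.session.get_session()
s3_client = session.create_client(
    's3',
    aws_access_key_id='AKID',          # placeholder credentials
    aws_secret_access_key='SECRET',
)

# Mirror _store_in_thread(): the ACL keyword is passed only when an
# ACL has been configured for the feed.
with open('items.jl', 'rb') as feed_file:  # hypothetical exported feed
    kwargs = {'ACL': 'private'}            # or {} when no ACL is set
    s3_client.put_object(Bucket='my-bucket', Key='exports/items.jl',
                         Body=feed_file, **kwargs)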
97 changes: 32 additions & 65 deletions scrapy/pipelines/files.py
@@ -11,7 +11,6 @@
 import time
 from collections import defaultdict
 from contextlib import suppress
-from email.utils import mktime_tz, parsedate_tz
 from ftplib import FTP
 from io import BytesIO
 from urllib.parse import urlparse
@@ -23,7 +22,7 @@
 from scrapy.http import Request
 from scrapy.pipelines.media import MediaPipeline
 from scrapy.settings import Settings
-from scrapy.utils.boto import is_botocore
+from scrapy.utils.boto import is_botocore_available
 from scrapy.utils.datatypes import CaselessDict
 from scrapy.utils.ftp import ftp_store_file
 from scrapy.utils.log import failure_to_exc_info
@@ -91,86 +90,54 @@ class S3FilesStore:
     }
 
     def __init__(self, uri):
-        self.is_botocore = is_botocore()
-        if self.is_botocore:
-            import botocore.session
-            session = botocore.session.get_session()
-            self.s3_client = session.create_client(
-                's3',
-                aws_access_key_id=self.AWS_ACCESS_KEY_ID,
-                aws_secret_access_key=self.AWS_SECRET_ACCESS_KEY,
-                endpoint_url=self.AWS_ENDPOINT_URL,
-                region_name=self.AWS_REGION_NAME,
-                use_ssl=self.AWS_USE_SSL,
-                verify=self.AWS_VERIFY
-            )
-        else:
-            from boto.s3.connection import S3Connection
-            self.S3Connection = S3Connection
+        if not is_botocore_available():
+            raise NotConfigured('missing botocore library')
+        import botocore.session
+        session = botocore.session.get_session()
+        self.s3_client = session.create_client(
+            's3',
+            aws_access_key_id=self.AWS_ACCESS_KEY_ID,
+            aws_secret_access_key=self.AWS_SECRET_ACCESS_KEY,
+            endpoint_url=self.AWS_ENDPOINT_URL,
+            region_name=self.AWS_REGION_NAME,
+            use_ssl=self.AWS_USE_SSL,
+            verify=self.AWS_VERIFY
+        )
         if not uri.startswith("s3://"):
             raise ValueError(f"Incorrect URI scheme in {uri}, expected 's3'")
         self.bucket, self.prefix = uri[5:].split('/', 1)
 
     def stat_file(self, path, info):
         def _onsuccess(boto_key):
-            if self.is_botocore:
-                checksum = boto_key['ETag'].strip('"')
-                last_modified = boto_key['LastModified']
-                modified_stamp = time.mktime(last_modified.timetuple())
-            else:
-                checksum = boto_key.etag.strip('"')
-                last_modified = boto_key.last_modified
-                modified_tuple = parsedate_tz(last_modified)
-                modified_stamp = int(mktime_tz(modified_tuple))
+            checksum = boto_key['ETag'].strip('"')
+            last_modified = boto_key['LastModified']
+            modified_stamp = time.mktime(last_modified.timetuple())
             return {'checksum': checksum, 'last_modified': modified_stamp}
 
         return self._get_boto_key(path).addCallback(_onsuccess)
 
-    def _get_boto_bucket(self):
-        # disable ssl (is_secure=False) because of this python bug:
-        # https://bugs.python.org/issue5103
-        c = self.S3Connection(self.AWS_ACCESS_KEY_ID, self.AWS_SECRET_ACCESS_KEY, is_secure=False)
-        return c.get_bucket(self.bucket, validate=False)
-
     def _get_boto_key(self, path):
         key_name = f'{self.prefix}{path}'
-        if self.is_botocore:
-            return threads.deferToThread(
-                self.s3_client.head_object,
-                Bucket=self.bucket,
-                Key=key_name)
-        else:
-            b = self._get_boto_bucket()
-            return threads.deferToThread(b.get_key, key_name)
+        return threads.deferToThread(
+            self.s3_client.head_object,
+            Bucket=self.bucket,
+            Key=key_name)
 
     def persist_file(self, path, buf, info, meta=None, headers=None):
        """Upload file to S3 storage"""
        key_name = f'{self.prefix}{path}'
        buf.seek(0)
-        if self.is_botocore:
-            extra = self._headers_to_botocore_kwargs(self.HEADERS)
-            if headers:
-                extra.update(self._headers_to_botocore_kwargs(headers))
-            return threads.deferToThread(
-                self.s3_client.put_object,
-                Bucket=self.bucket,
-                Key=key_name,
-                Body=buf,
-                Metadata={k: str(v) for k, v in (meta or {}).items()},
-                ACL=self.POLICY,
-                **extra)
-        else:
-            b = self._get_boto_bucket()
-            k = b.new_key(key_name)
-            if meta:
-                for metakey, metavalue in meta.items():
-                    k.set_metadata(metakey, str(metavalue))
-            h = self.HEADERS.copy()
-            if headers:
-                h.update(headers)
-            return threads.deferToThread(
-                k.set_contents_from_string, buf.getvalue(),
-                headers=h, policy=self.POLICY)
+        extra = self._headers_to_botocore_kwargs(self.HEADERS)
+        if headers:
+            extra.update(self._headers_to_botocore_kwargs(headers))
+        return threads.deferToThread(
+            self.s3_client.put_object,
+            Bucket=self.bucket,
+            Key=key_name,
+            Body=buf,
+            Metadata={k: str(v) for k, v in (meta or {}).items()},
+            ACL=self.POLICY,
+            **extra)
 
     def _headers_to_botocore_kwargs(self, headers):
         """ Convert headers to botocore keyword agruments.
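
Note: the stat path above wraps a blocking head_object call in a thread via Twisted. A minimal standalone sketch of that pattern, with a hypothetical helper name and bucket (assumptions, not part of the commit):

import time

import botocore.session
from twisted.internet import threads

session = botocore.session.get_session()
s3_client = session.create_client('s3')  # credentials from the environment


def stat_s3_key(bucket, key_name):
    # head_object blocks, so run it in Twisted's thread pool; the
    # resulting Deferred fires with the response dict.
    d = threads.deferToThread(s3_client.head_object,
                              Bucket=bucket, Key=key_name)
    d.addCallback(lambda obj: {
        'checksum': obj['ETag'].strip('"'),
        'last_modified': time.mktime(obj['LastModified'].timetuple()),
    })
    return d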
23 changes: 22 additions & 1 deletion scrapy/utils/boto.py
@@ -1,11 +1,32 @@
 """Boto/botocore helpers"""
+import warnings
 
-from scrapy.exceptions import NotConfigured
+from scrapy.exceptions import NotConfigured, ScrapyDeprecationWarning
 
 
 def is_botocore():
+    """ Returns True if botocore is available, otherwise raises NotConfigured. Never returns False.
+
+    Previously, when boto was supported in addition to botocore, this returned False if boto was available
+    but botocore wasn't.
+    """
+    message = (
+        'is_botocore() is deprecated and always returns True or raises an Exception, '
+        'so it cannot be used for checking if boto is available instead of botocore. '
+        'You can use scrapy.utils.boto.is_botocore_available() to check if botocore '
+        'is available.'
+    )
+    warnings.warn(message, ScrapyDeprecationWarning, stacklevel=2)
     try:
         import botocore  # noqa: F401
         return True
     except ImportError:
         raise NotConfigured('missing botocore library')
+
+
+def is_botocore_available():
+    try:
+        import botocore  # noqa: F401
+        return True
+    except ImportError:
+        return False
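
Note: callers that used is_botocore() as an availability probe can migrate as follows; this sketch is based on the call sites updated in this commit:

from scrapy.exceptions import NotConfigured
from scrapy.utils.boto import is_botocore_available

# Before: is_botocore() raised NotConfigured itself (and now also emits
# a ScrapyDeprecationWarning), so callers wrapped it in try/except.
# After: check the boolean and raise explicitly.
if not is_botocore_available():
    raise NotConfigured('missing botocore library')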
29 changes: 9 additions & 20 deletions scrapy/utils/test.py
@@ -10,8 +10,7 @@
 from importlib import import_module
 from twisted.trial.unittest import SkipTest
 
-from scrapy.exceptions import NotConfigured
-from scrapy.utils.boto import is_botocore
+from scrapy.utils.boto import is_botocore_available
 
 
 def assert_aws_environ():
@@ -29,29 +28,19 @@ def assert_gcs_environ():
 
 
 def skip_if_no_boto():
-    try:
-        is_botocore()
-    except NotConfigured as e:
-        raise SkipTest(e)
+    if not is_botocore_available():
+        raise SkipTest('missing botocore library')
 
 
 def get_s3_content_and_delete(bucket, path, with_key=False):
     """ Get content from s3 key, and delete key afterwards.
     """
-    if is_botocore():
-        import botocore.session
-        session = botocore.session.get_session()
-        client = session.create_client('s3')
-        key = client.get_object(Bucket=bucket, Key=path)
-        content = key['Body'].read()
-        client.delete_object(Bucket=bucket, Key=path)
-    else:
-        import boto
-        # assuming boto=2.2.2
-        bucket = boto.connect_s3().get_bucket(bucket, validate=False)
-        key = bucket.get_key(path)
-        content = key.get_contents_as_string()
-        bucket.delete_key(path)
+    import botocore.session
+    session = botocore.session.get_session()
+    client = session.create_client('s3')
+    key = client.get_object(Bucket=bucket, Key=path)
+    content = key['Body'].read()
+    client.delete_object(Bucket=bucket, Key=path)
     return (content, key) if with_key else content
 
 
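
Note: in a test suite, the updated helper is typically called from setUp so the whole case is skipped when botocore is absent. A hypothetical example (the test class and method are assumptions for illustration):

from twisted.trial import unittest

from scrapy.utils.test import skip_if_no_boto


class S3FeedStorageTest(unittest.TestCase):  # hypothetical test case
    def setUp(self):
        skip_if_no_boto()  # raises SkipTest when botocore is missing

    def test_store(self):
        ...  # test body elided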
