Skip to content

Commit

Permalink
Add example;Ready to publish
Browse files Browse the repository at this point in the history
Signed-off-by: Jack Deng <[email protected]>
  • Loading branch information
JackTheMico committed Aug 23, 2022
1 parent e0e7549 commit 422015e
Show file tree
Hide file tree
Showing 9 changed files with 286 additions and 54 deletions.
1 change: 1 addition & 0 deletions .pylintrc
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,7 @@ disable=raw-checker-failed,
R0903,
R0913,
R0916,
R0801
# Enable the message, report, category or checker with the given id(s). You can
# either give multiple identifier separated by comma (,) or put this option
# multiple time (only on the command line, not in the configuration file where
Expand Down
74 changes: 74 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -13,5 +13,79 @@ pip install ruia-peewee-async


```python
from peewee import CharField
from ruia import AttrField, Item, Response, Spider, TextField

from ruia_peewee_async import (RuiaPeeweeInsert, RuiaPeeweeUpdate, TargetDB,
after_start)

class DoubanItem(Item):
target_item = TextField(css_select="tr.item")
title = AttrField(css_select="a.nbg", attr="title")
url = AttrField(css_select="a.nbg", attr="href")

async def clean_title(self, value):
return value.strip()

class DoubanSpider(Spider):
start_urls = ["https://movie.douban.com/chart"]
# aiohttp_kwargs = {"proxy": "http://127.0.0.1:7890"}

async def parse(self, response: Response):
async for item in DoubanItem.get_items(html=await response.text()):
yield RuiaPeeweeInsert(item.results) # default is MySQL
# yield RuiaPeeweeInsert(item.results, database=TargetDB.POSTGRES) # save to Postgresql
# yield RuiaPeeweeInsert(item.results, database=TargetDB.BOTH) # save to both MySQL and Postgresql

class DoubanUpdateSpider(Spider):
start_urls = ["https://movie.douban.com/chart"]

async def parse(self, response: Response):
async for item in DoubanItem.get_items(html=await response.text()):
res = {}
res["title"] = item.results["title"]
res["url"] = "http://whatever.youwanttoupdate.com"
yield RuiaPeeweeUpdate(
res,
{"title": res["title"]},
database=TargetDB.POSTGRES, # default is MySQL
)

# Args for RuiaPeeweeUpdate
# data: A dict that's going to be updated in the database.
# query: A peewee query or a dict to search for the target data in database.
# database: The target database type.
# create_when_not_exists: If True, a record is created when the data does not exist. Default is True.
# only: A list or tuple of fields that should be updated.

mysql = {
"host": "127.0.0.1",
"port": 3306,
"user": "ruiamysql",
"password": "abc123",
"database": "ruiamysql",
"model": {
"table_name": "ruia_mysql",
"title": CharField(),
"url": CharField(),
},
}
postgres = {
"host": "127.0.0.1",
"port": 5432,
"user": "ruiapostgres",
"password": "abc123",
"database": "ruiapostgres",
"model": {
"table_name": "ruia_postgres",
"title": CharField(),
"url": CharField(),
},
}

if __name__ == "__main__":
DoubanSpider.start(after_start=after_start(mysql=mysql))
# DoubanSpider.start(after_start=after_start(postgres=postgres))
# DoubanSpider.start(after_start=after_start(mysql=mysql, postgres=postgres))
# DoubanUpdateSpider.start(after_start=after_start(mysql=mysql))
```
79 changes: 79 additions & 0 deletions examples/douban.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
# -*- coding: utf-8 -*-
from peewee import CharField
from ruia import AttrField, Item, Response, Spider, TextField

from ruia_peewee_async import RuiaPeeweeInsert, RuiaPeeweeUpdate, TargetDB, after_start


class DoubanItem(Item):
    """Item describing one entry scraped from the Douban movie chart page."""

    # NOTE(review): presumably ruia treats ``target_item`` as the selector for
    # each repeated entry (here every ``tr.item`` row) -- confirm in ruia docs.
    target_item = TextField(css_select="tr.item")
    # Both values are read from attributes of the same ``a.nbg`` element.
    title = AttrField(css_select="a.nbg", attr="title")
    url = AttrField(css_select="a.nbg", attr="href")

    async def clean_title(self, value):
        """Return ``value`` with leading and trailing whitespace removed.

        NOTE(review): presumably invoked by ruia for the ``title`` field via
        the ``clean_<field>`` naming convention -- confirm in ruia docs.
        """
        return value.strip()


class DoubanSpider(Spider):
    """Example spider that stores every parsed chart entry with an insert."""

    start_urls = ["https://movie.douban.com/chart"]
    # aiohttp_kwargs = {"proxy": "http://127.0.0.1:7890"}

    async def parse(self, response: Response):
        """Yield one ``RuiaPeeweeInsert`` per item found in the response."""
        page_html = await response.text()
        async for movie in DoubanItem.get_items(html=page_html):
            # No ``database`` argument is given, so the default (MySQL) is used.
            yield RuiaPeeweeInsert(movie.results)
            # Alternatives:
            # yield RuiaPeeweeInsert(item.results, database=TargetDB.POSTGRES)  # save to Postgresql
            # yield RuiaPeeweeInsert(item.results, database=TargetDB.BOTH)  # save to both MySQL and Postgresql


class DoubanUpdateSpider(Spider):
    """Example spider that rewrites the stored URL of every parsed entry."""

    start_urls = ["https://movie.douban.com/chart"]

    async def parse(self, response: Response):
        """Yield one ``RuiaPeeweeUpdate`` per item found in the response."""
        page_html = await response.text()
        async for movie in DoubanItem.get_items(html=page_html):
            title = movie.results["title"]
            new_values = {
                "title": title,
                "url": "http://whatever.youwanttoupdate.com",
            }
            # The second argument is the lookup used to find the target row.
            yield RuiaPeeweeUpdate(
                new_values,
                {"title": title},
                database=TargetDB.POSTGRES,  # default is MySQL
            )

# Args for RuiaPeeweeUpdate
# data: A dict that's going to be updated in the database.
# query: A peewee query or a dict to search for the target data in database.
# database: The target database type.
# create_when_not_exists: If True, a record is created when the data does not exist. Default is True.
# only: A list or tuple of fields that should be updated.


# Settings handed to ``after_start(mysql=...)``. The ``model`` entry carries
# the table name plus one peewee field per stored column; the remaining keys
# are the connection parameters.
mysql = {
    "host": "127.0.0.1",
    "port": 3306,
    "user": "ruiamysql",
    "password": "abc123",
    "database": "ruiamysql",
    "model": {
        "table_name": "ruia_mysql",
        "title": CharField(),
        "url": CharField(),
    },
}
# Same layout, handed to ``after_start(postgres=...)``.
postgres = {
    "host": "127.0.0.1",
    "port": 5432,
    "user": "ruiapostgres",
    "password": "abc123",
    "database": "ruiapostgres",
    "model": {
        "table_name": "ruia_postgres",
        "title": CharField(),
        "url": CharField(),
    },
}

if __name__ == "__main__":
    # Runs the insert example against MySQL. The commented lines below show
    # the other spider/database combinations.
    DoubanSpider.start(after_start=after_start(mysql=mysql))
    # DoubanSpider.start(after_start=after_start(postgres=postgres))
    # DoubanSpider.start(after_start=after_start(mysql=mysql, postgres=postgres))
    # DoubanUpdateSpider.start(after_start=after_start(mysql=mysql))
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "ruia-peewee-async"
version = "0.1.0"
version = "1.0.0"
description = "A Ruia plugin that uses the peewee-async to store data to MySQL"
authors = ["Jack Deng <[email protected]>"]
license = "MIT"
Expand Down
69 changes: 68 additions & 1 deletion ruia_peewee_async/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,14 @@ class ParameterError(Exception):

class RuiaPeeweeInsert:
def __init__(self, data: Dict, database: TargetDB = TargetDB.MYSQL) -> None:
"""
Args:
data: A dict that's going to be inserted into the database.
database: The target database type.
"""

self.data = data
self.database = database

Expand Down Expand Up @@ -71,6 +79,17 @@ def __init__(
create_when_not_exists: bool = True,
only: Optional[Union[Tuple[str], List[str]]] = None,
) -> None:
"""
Args:
data: A dict that's going to be updated in the database.
query: A peewee query or a dict to search for the target data in database.
database: The target database type.
create_when_not_exists: If True, a record is created when the data does not exist. Default is True.
only: A list or tuple of fields that should be updated.
"""

self.data = data
self.query = query
self.database = database
Expand Down Expand Up @@ -179,7 +198,6 @@ def init_spider(*, spider_ins: Spider):
with spider_ins.postgres_manager.allow_sync():
spider_ins.postgres_model.create_table(True)
spider_ins.callback_result_map = spider_ins.callback_result_map or {}
# MySQL Insert
spider_ins.process_insert_callback_result = MethodType(
RuiaPeeweeInsert.process, spider_ins
)
Expand All @@ -192,3 +210,52 @@ def init_spider(*, spider_ins: Spider):
spider_ins.callback_result_map.update(
{"RuiaPeeweeUpdate": "process_update_callback_result"}
)


def raise_no_model(config, model, name):
    """Raise ``ParameterError`` when a database is configured without a model.

    Args:
        config: Connection settings of one database. A falsy value means the
            database is not configured, so nothing is checked.
        model: Value of the config's ``'model'`` entry; falsy when the entry
            is missing or empty.
        name: Database name used as the prefix of the error message.

    Raises:
        ParameterError: If ``config`` is truthy while ``model`` is falsy.
    """
    if config and not model:
        raise ParameterError(
            f"""{name} must have 'model' in config and 'model' cannot be empty.
For example:
{{
'host': '127.0.0.1',
'port': 3306,
'user': 'ruiamysql',
'password': 'abc123',
'database': 'ruiamysql',
'model': {{
'table_name': 'ruia_mysql',
"title": CharField(),
'url': CharField(),
}},
}}
"""
        )


def after_start(**kwargs):
    """Build the ``after_start`` hook that attaches database settings to a spider.

    Args:
        **kwargs: ``mysql`` and/or ``postgres``. Each is a dict of connection
            settings that also contains a non-empty ``'model'`` entry.
            ``None`` is treated like an absent config.

    Returns:
        An async callable taking the spider instance. For every configured
        database it sets ``<db>_config`` and ``<db>_model`` on the spider and
        then calls ``init_spider``.

    Raises:
        ParameterError: If no keyword argument is given, if both configs are
            empty, or if a config lacks a usable ``'model'`` entry.
    """
    if not kwargs:
        raise ParameterError(
            "There must be a 'mysql' or 'postgres' parameter or both of them."
        )
    # Work on shallow copies: popping 'model' from the caller's own dict would
    # make a second after_start(...) call with the same config fail.
    # ``or {}`` also tolerates an explicit ``mysql=None`` / ``postgres=None``.
    mysql = dict(kwargs.get("mysql") or {})
    postgres = dict(kwargs.get("postgres") or {})
    if not mysql and not postgres:
        raise ParameterError(
            "MySQL and PostgreSQL configs cannout be empty at the same time."
        )
    # Split the model description from the connection settings.
    mysql_model = mysql.pop("model", None)
    postgres_model = postgres.pop("model", None)
    raise_no_model(mysql, mysql_model, "MySQL")
    raise_no_model(postgres, postgres_model, "PostgreSQL")

    async def init_after_start(spider_ins):
        """Copy the validated settings onto ``spider_ins`` and initialise it."""
        if mysql and mysql_model:
            spider_ins.mysql_config = mysql
            spider_ins.mysql_model = mysql_model
        if postgres and postgres_model:
            spider_ins.postgres_config = postgres
            spider_ins.postgres_model = postgres_model
        init_spider(spider_ins=spider_ins)

    return init_after_start
10 changes: 5 additions & 5 deletions tests/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
from ruia_peewee_async import RuiaPeeweeInsert, RuiaPeeweeUpdate, Spider, TargetDB


class HackerNewsItem(Item):
class DoubanItem(Item):
target_item = TextField(css_select="tr.item")
title = AttrField(css_select="a.nbg", attr="title")
url = AttrField(css_select="a.nbg", attr="href")
Expand All @@ -15,16 +15,16 @@ async def clean_title(self, value):
return value.strip()


class HackerNewsSpider(Spider):
class DoubanSpider(Spider):
start_urls = ["https://movie.douban.com/chart"]
# aiohttp_kwargs = {"proxy": "http://127.0.0.1:7890"}

async def parse(self, response: Response):
async for item in HackerNewsItem.get_items(html=await response.text()):
async for item in DoubanItem.get_items(html=await response.text()):
yield item


class Insert(HackerNewsSpider):
class Insert(DoubanSpider):
def __init__(
self,
middleware: typing.Union[typing.Iterable, Middleware] = None,
Expand All @@ -44,7 +44,7 @@ async def parse(self, response):
yield RuiaPeeweeInsert(item.results, database=self.target_db)


class Update(HackerNewsSpider):
class Update(DoubanSpider):
def __init__(
self,
middleware: typing.Union[typing.Iterable, Middleware] = None,
Expand Down
45 changes: 26 additions & 19 deletions tests/test_both.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
import pytest
from peewee import CharField

from ruia_peewee_async import TargetDB, init_spider
from ruia_peewee_async import TargetDB, after_start

from .common import Insert, Update

Expand All @@ -22,30 +22,35 @@ async def parse(self, response):


def basic_setup(mysql, postgresql):
async def init_after_start(spider_ins):
spider_ins.mysql_config = mysql
spider_ins.mysql_model = {
"table_name": "ruia_mysql_both",
"title": CharField(),
"url": CharField(),
mysql.update(
{
"model": {
"table_name": "ruia_mysql",
"title": CharField(),
"url": CharField(),
}
}
spider_ins.postgres_config = postgresql
spider_ins.postgres_model = {
"table_name": "ruia_postgres_both",
"title": CharField(),
"url": CharField(),
)
postgresql.update(
{
"model": {
"table_name": "ruia_postgres",
"title": CharField(),
"url": CharField(),
}
}
init_spider(spider_ins=spider_ins)

return init_after_start
)
return mysql, postgresql


class TestBoth:
@pytest.mark.dependency()
async def test_both_insert(self, mysql, postgresql, event_loop):
after_start = basic_setup(mysql, postgresql)
mysql, postgresql = basic_setup(mysql, postgresql)
spider_ins = await BothInsert.async_start(
loop=event_loop, after_start=after_start, target_db=TargetDB.BOTH
loop=event_loop,
after_start=after_start(mysql=mysql, postgres=postgresql),
target_db=TargetDB.BOTH,
)
count_mysql = await spider_ins.mysql_manager.count(
spider_ins.mysql_model.select()
Expand All @@ -58,9 +63,11 @@ async def test_both_insert(self, mysql, postgresql, event_loop):

@pytest.mark.dependency(depends=["TestBoth::test_both_insert"])
async def test_both_update(self, mysql, postgresql, event_loop):
after_start = basic_setup(mysql, postgresql)
mysql, postgresql = basic_setup(mysql, postgresql)
spider_ins = await BothUpdate.async_start(
loop=event_loop, after_start=after_start, target_db=TargetDB.BOTH
loop=event_loop,
after_start=after_start(mysql=mysql, postgres=postgresql),
target_db=TargetDB.BOTH,
)
mysql_one = await spider_ins.mysql_manager.get(
spider_ins.mysql_model, id=randint(1, 11)
Expand Down
Loading

0 comments on commit 422015e

Please sign in to comment.