♻️ JSON crawler: optimize the parsing approach
Blushyes committed Jan 12, 2024
1 parent 7534ca7 commit 2e85d03
Showing 3 changed files with 73 additions and 26 deletions.
56 changes: 33 additions & 23 deletions src/crawler.py
@@ -1,12 +1,11 @@
-from functools import reduce
 import requests
 from lxml import etree
-from typing import Any, Collection, Callable
-from .sweet import import_from_string
+from typing import Collection, Callable
+from .sweet import extract_json_values_by_path, import_from_string


-def html_xpath_crawler(url: str, xpath: str, **kwargs) -> Any:
-    """Simple page crawler based on HTML and XPATH
+def html_xpath_crawler(url: str, xpath: str, **kwargs) -> any:
+    """Simple page crawler based on HTML and XPath

     Args:
         **kwargs: extra keyword arguments for requests.get(), such as headers
@@ -27,32 +26,33 @@ def json_crawler(
     method: str = "get",
     getter: None
     | str
+    | dict
     | Collection[str | int]
     | Collection[Collection[str | int]]
     | Callable[[dict], any] = None,
-    **kwargs
-) -> Any:
+    **kwargs,
+) -> any:
     """
-    Fetch JSON data from the given URL and optionally extract specific values
+    Fetch JSON data from the given URL and optionally extract specific values.

     Requests data from the given URL using the specified HTTP method, parses the returned content as JSON,
-    and retrieves specific data from it according to the provided `getter`
+    and retrieves specific data from it according to the provided `getter`.

     Args:
         getter: Controls how specific data is extracted from the returned JSON. It can take several forms:
             - If None, the whole JSON object is returned
             - If a string, it is treated as a '.'-separated attribute path inside the JSON object (e.g. "results.0.name")
             - If a collection of strings or integers, each key or index is looked up in order (e.g. ['results', 0, 'name'])
             - If a collection of collections, a list is returned containing the data retrieved via each sub-collection (e.g. [['results', 0, 'name'], ['info', 'count']])
             - If a callable, it is called with the whole JSON object and its result is returned
         **kwargs: Extra keyword arguments passed to the request function

     Returns:
-        Any: The whole JSON object if no `getter` is given; otherwise the return type depends on the behaviour of `getter`
+        Any: The whole JSON object if no `getter` is given; otherwise the return type depends on the behaviour of `getter`.

     Raises:
-        KeyError: If the path specified by `getter` does not exist
-        RequestException: Raised by the requests library if the HTTP request fails
+        KeyError: If the path specified by `getter` does not exist.
+        RequestException: Raised by the requests library if the HTTP request fails.

     Example:
         # Fetch the whole JSON payload
@@ -67,11 +67,22 @@ def json_crawler(
         # Extract data from multiple paths as a list
         multiple_values = json_crawler("https://example.com/api/data", getter=[['results', 0], ['info']])

+        # Extract data from multiple paths and collect it into a dict
+        collected_values = json_crawler("https://example.com/api/data", getter={('results', 0): 'k1', ('info',): 'k2'})

         # Process the data with a custom function
         def custom_process(json_data):
             return [item['name'] for item in json_data['results']]
         processed_data = json_crawler("https://example.com/api/data", getter=custom_process)
     """

+    def extract_by_path_getter(data, path):
+        if all(isinstance(it, str) or isinstance(it, int) for it in path):
+            return extract_json_values_by_path(data, *path)
+
+        if all(isinstance(it, Collection) for it in path):
+            return [extract_json_values_by_path(data, *it) for it in path]
+
     request_func = import_from_string("requests", method)
     res = request_func(url, **kwargs).json()
     if getter is None:
@@ -80,12 +91,11 @@ def custom_process(json_data):
     if isinstance(getter, str):
         getter = getter.split(".")

-    if isinstance(getter, Collection):
-        if all(isinstance(it, str) or isinstance(it, int) for it in getter):
-            return reduce(lambda d, key: d[key], getter, res)
+    if isinstance(getter, dict):
+        return {v: extract_by_path_getter(res, k) for k, v in getter.items()}

-        if all(isinstance(it, Collection) for it in getter):
-            return [reduce(lambda d, key: d[key], it, res) for it in getter]
+    if isinstance(getter, Collection):
+        return extract_by_path_getter(res, getter)

     if isinstance(getter, Callable):
         return getter(res)
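For context, a minimal usage sketch of the refactored `getter` dispatch. The endpoint, response shape, and output keys below are hypothetical, and list indices are written as digit strings so that `extract_json_values_by_path` can resolve them:

```python
from src.crawler import json_crawler

# Assume the endpoint returns {"results": [{"name": "Alice"}], "info": {"count": 1}}
URL = "https://example.com/api/data"

# Dotted string path
name = json_crawler(URL, getter="results.0.name")

# A collection of paths returns a list of values
pair = json_crawler(URL, getter=[["results", "0", "name"], ["info", "count"]])

# A dict getter maps each path to an output key and collects the values into a dict
summary = json_crawler(
    URL,
    getter={("results", "0", "name"): "first_name", ("info", "count"): "total"},
)
```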
6 changes: 3 additions & 3 deletions src/file.py
@@ -36,13 +36,13 @@ def list_files_recursive(
     directory: str = os.path.curdir,
     exclude: list[str] | str = None,
     file_filter: Callable[[FileInfo], list[FileInfo]] = None,
-    safely=False,
+    safely=True,
 ) -> list[FileInfo]:
     """Recursively list all files, defaulting to every file under the current directory

     Args:
         exclude: A file or directory name (or several) to exclude, matched simply by name
         file_filter: A filter callable for excluding files with more flexibility
         safely: If true, exceptions encountered along the way are suppressed instead of raised

     Returns:
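The only behavioural change in src/file.py is the `safely` default. A hypothetical call under the new default (the directory and exclude names are made up, and the exact `FileInfo` fields are not shown in this diff):

```python
from src.file import list_files_recursive

# With safely=True as the default, errors hit while walking the tree are
# suppressed rather than raised, so no extra argument is needed.
files = list_files_recursive("some_project", exclude=[".git", "__pycache__"])
print(len(files))
```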
37 changes: 37 additions & 0 deletions src/sweet.py
@@ -1,3 +1,4 @@
+import json
 from typing import Collection, Callable
 from importlib import import_module

@@ -12,3 +13,39 @@ def import_from_string(module_name: str, func_name: str) -> Callable:
     module = import_module(module_name)
     func = getattr(module, func_name)
     return func
+
+
+def extract_json_values_by_path(data: dict | str, *path, safely: bool = True):
+    """Get values from JSON data by following the given path"""
+    if isinstance(data, str):
+        data = json.loads(data)
+
+    # A recursive helper that walks the given path
+    def traverse(data, *keys):
+        # No more keys left: return the current data
+        if not keys:
+            return data
+
+        # Take the next key off the path
+        current_key, *remaining_keys = keys
+
+        if isinstance(data, list) and current_key.isdigit():
+            # A digit key on a list is treated as an index into the list
+            current_index = int(current_key)
+            return traverse(data[current_index], *remaining_keys)
+        elif isinstance(data, list):
+            # The data itself is a list: traverse every element with the same keys
+            return [traverse(elem, *keys) for elem in data]
+        elif isinstance(data, dict) and current_key in data:
+            # The data is a dict: descend directly into the next level
+            return traverse(data[current_key], *remaining_keys)
+        else:
+            # The path is invalid: return None or raise, depending on `safely`
+            if safely:
+                return None
+
+            raise KeyError("Invalid JSON path!")
+
+    # Start traversing from the root of the JSON data
+    return traverse(data, *path)
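
A minimal usage sketch of the new `extract_json_values_by_path` helper (the payload is made up; note that list indices are passed as digit strings, matching the `isdigit()` check, and that a missing path returns `None` because `safely` defaults to `True`):

```python
from src.sweet import extract_json_values_by_path

payload = '{"results": [{"name": "Alice"}, {"name": "Bob"}], "info": {"count": 2}}'

print(extract_json_values_by_path(payload, "results", "0", "name"))  # Alice
print(extract_json_values_by_path(payload, "info", "count"))         # 2
print(extract_json_values_by_path(payload, "results", "name"))       # ['Alice', 'Bob'] (fans out over the list)
print(extract_json_values_by_path(payload, "missing", "key"))        # None (safely=True)
```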
