Skip to content

Commit

Permalink
null处理提供选项
Browse files Browse the repository at this point in the history
  • Loading branch information
wukan1986 committed Dec 19, 2024
1 parent d231fad commit 64a2d2a
Show file tree
Hide file tree
Showing 5 changed files with 39 additions and 14 deletions.
11 changes: 8 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -140,15 +140,20 @@ df = codegen_exec(df.lazy(), _code_block_1, _code_block_2).collect(engine="gpu")
https://github.com/pola-rs/polars/issues/12925#issuecomment-2552764629
非常棒的点子,总结下来有两种实现方式:

1.`null`分成一组,`not_null`分成另一组要计算两次
2. 仅一组,但复合排序,将`null`排在前面,`not_null`排后面,只要计算一次
1. `null`分成一组,`not_null`分成另一组,要计算两次
2. 仅一组,但复合排序,将`null`排在前面,`not_null`排后面。只计算一次,略快一些

```python
X1 = (ts_returns(CLOSE, 3)).over(CLOSE.is_not_null(), _ASSET_, order_by=_DATE_),
X2 = (ts_returns(CLOSE, 3)).over(_ASSET_, order_by=[CLOSE.is_not_null(), _DATE_]),
X3 = (ts_returns(CLOSE, 3)).over(_ASSET_, order_by=_DATE_),
```

目前使用的是第2种
第2种开头的`null`区域,是否影响结果由算子所决定,特别是多列输入时,`null`区域可能有数据

1. `over_null='partition_by'`(默认值)。分到两个区域
2. `over_null='order_by'`。分到一个区域,`null`排在前面
3. `over_null=None`。不处理,直接计算,速度更快

## 二次开发

Expand Down
3 changes: 2 additions & 1 deletion expr_codegen/pandas/code.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,8 @@ def codegen(exprs_ldl: ListDictList, exprs_src, syms_dst,
filename='template.py.j2',
date='date', asset='asset',
alias: Dict[str, str] = {},
extra_codes: Sequence[str] = ()):
extra_codes: Sequence[str] = (),
**kwargs):
"""基于模板的代码生成"""
# 打印Pandas风格代码
p = PandasStrPrinter()
Expand Down
3 changes: 2 additions & 1 deletion expr_codegen/polars_group/code.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,8 @@ def codegen(exprs_ldl: ListDictList, exprs_src, syms_dst,
filename='template.py.j2',
date='date', asset='asset',
alias: Dict[str, str] = {},
extra_codes: Sequence[str] = ()):
extra_codes: Sequence[str] = (),
**kwargs):
"""基于模板的代码生成"""
# 打印Polars风格代码
p = PolarsStrPrinter()
Expand Down
14 changes: 10 additions & 4 deletions expr_codegen/polars_over/code.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
import os
from typing import Sequence, Dict
from typing import Sequence, Dict, Literal

import jinja2
from jinja2 import FileSystemLoader, TemplateNotFound
Expand Down Expand Up @@ -39,7 +39,9 @@ def codegen(exprs_ldl: ListDictList, exprs_src, syms_dst,
filename='template.py.j2',
date='date', asset='asset',
alias: Dict[str, str] = {},
extra_codes: Sequence[str] = ()):
extra_codes: Sequence[str] = (),
over_null: Literal['order_by', 'partition_by', None] = 'partition_by',
**kwargs):
"""基于模板的代码生成"""
# 打印Polars风格代码
p = PolarsStrPrinter()
Expand Down Expand Up @@ -82,8 +84,12 @@ def codegen(exprs_ldl: ListDictList, exprs_src, syms_dst,
_sym = f"pl.all_horizontal({','.join(_sym)})"
else:
_sym = ','.join(_sym)
# func_code.append(f"{va}=({s2}).over({_sym}, _ASSET_, order_by=_DATE_),")
func_code.append(f"{va}=({s2}).over(_ASSET_, order_by=[{_sym}, _DATE_]),")
if over_null == 'partition_by':
func_code.append(f"{va}=({s2}).over({_sym}, _ASSET_, order_by=_DATE_),")
elif over_null == 'order_by':
func_code.append(f"{va}=({s2}).over(_ASSET_, order_by=[{_sym}, _DATE_]),")
else:
func_code.append(f"{va}=({s2}).over(_ASSET_, order_by=_DATE_),")
elif k[0] == CS:
func_code.append(f"{va}=({s2}).over(_DATE_),")
elif k[0] == GP:
Expand Down
22 changes: 17 additions & 5 deletions expr_codegen/tool.py
Original file line number Diff line number Diff line change
Expand Up @@ -193,7 +193,8 @@ def all(self, exprs_src, style: Literal['pandas', 'polars_group', 'polars_over']
replace: bool = True, regroup: bool = False, format: bool = True,
date='date', asset='asset',
alias: Dict[str, str] = {},
extra_codes: Sequence[object] = ()):
extra_codes: Sequence[object] = (),
**kwargs):
"""功能集成版,将几个功能写到一起方便使用
Parameters
Expand Down Expand Up @@ -252,7 +253,8 @@ def all(self, exprs_src, style: Literal['pandas', 'polars_group', 'polars_over']
codes = codegen(exprs_ldl, exprs_src, syms_dst,
filename=template_file, date=date, asset=asset,
alias=alias,
extra_codes=extra_codes)
extra_codes=extra_codes,
**kwargs)

if format:
# 格式化。在遗传算法中没有必要
Expand All @@ -267,7 +269,8 @@ def _get_code(self,
output_file: str,
convert_xor: bool,
style: Literal['pandas', 'polars_group', 'polars_over'] = 'polars_over', template_file: str = 'template.py.j2',
date: str = 'date', asset: str = 'asset') -> str:
date: str = 'date', asset: str = 'asset',
**kwargs) -> str:
"""通过字符串生成代码, 加了缓存,多次调用不重复生成"""
raw, exprs_dict = sources_to_exprs(self.globals_, source, *more_sources, convert_xor=convert_xor)

Expand All @@ -279,7 +282,8 @@ def _get_code(self,
extra_codes=(raw,
# 传入多个列的方法
extra_codes,
))
),
**kwargs)

# 移回到cache,防止多次调用多次保存
if isinstance(output_file, TextIOWrapper):
Expand Down Expand Up @@ -324,7 +328,8 @@ def codegen_exec(df: Optional[DataFrame],
style: Literal['pandas', 'polars_group', 'polars_over'] = 'polars_over',
template_file: str = 'template.py.j2',
date: str = 'date', asset: str = 'asset',
) -> Optional[DataFrame]:
over_null: Literal['order_by', 'partition_by', None] = 'partition_by',
**kwargs) -> Optional[DataFrame]:
"""快速转换源代码并执行
Parameters
Expand Down Expand Up @@ -355,6 +360,11 @@ def codegen_exec(df: Optional[DataFrame],
时间字段
asset: str
资产字段
over_null: str
时序中遇到null时的处理方式
- order_by: 空值排同一分区的前排
- partition_by: 空值划分到不同分区
- None: 不做处理
Returns
-------
Expand Down Expand Up @@ -391,6 +401,8 @@ def codegen_exec(df: Optional[DataFrame],
convert_xor=convert_xor,
style=style, template_file=template_file,
date=date, asset=asset,
over_null=over_null,
**kwargs
)

if df is None:
Expand Down

0 comments on commit 64a2d2a

Please sign in to comment.