null处理提供选项

wukan1986 · Dec 19, 2024 · 64a2d2a · 64a2d2a
1 parent d231fad
commit 64a2d2a
Show file tree

Hide file tree

Showing 5 changed files with 39 additions and 14 deletions.
diff --git a/README.md b/README.md
@@ -140,15 +140,20 @@ df = codegen_exec(df.lazy(), _code_block_1, _code_block_2).collect(engine="gpu")
 https://github.com/pola-rs/polars/issues/12925#issuecomment-2552764629
 非常棒的点子，总结下来有两种实现方式：
 
-1. 将`null`分成一组，`not_null`分成另一组，要计算两次
-2. 仅一组，但复合排序，将`null`排在前面，`not_null`排后面，只要计算一次
+1. 将`null`分成一组，`not_null`分成另一组。要计算两次
+2. 仅一组，但复合排序，将`null`排在前面，`not_null`排后面。只计算一次，略快一些
 
 ```python
 X1 = (ts_returns(CLOSE, 3)).over(CLOSE.is_not_null(), _ASSET_, order_by=_DATE_),
 X2 = (ts_returns(CLOSE, 3)).over(_ASSET_, order_by=[CLOSE.is_not_null(), _DATE_]),
+X3 = (ts_returns(CLOSE, 3)).over(_ASSET_, order_by=_DATE_),
 ```
 
-目前使用的是第2种
+第2种开头的`null`区域是否影响结果，由算子决定；特别是多列输入时，`null`区域可能有数据
+
+1. `over_null='partition_by'`（默认）。分到两个区域
+2. `over_null='order_by'`。分到一个区域，`null`排在前面
+3. `over_null=None`。不处理，直接计算，速度更快
 
 ## 二次开发
 

diff --git a/expr_codegen/pandas/code.py b/expr_codegen/pandas/code.py
@@ -38,7 +38,8 @@ def codegen(exprs_ldl: ListDictList, exprs_src, syms_dst,
             filename='template.py.j2',
             date='date', asset='asset',
             alias: Dict[str, str] = {},
-            extra_codes: Sequence[str] = ()):
+            extra_codes: Sequence[str] = (),
+            **kwargs):
     """基于模板的代码生成"""
     # 打印Pandas风格代码
     p = PandasStrPrinter()

diff --git a/expr_codegen/polars_group/code.py b/expr_codegen/polars_group/code.py
@@ -39,7 +39,8 @@ def codegen(exprs_ldl: ListDictList, exprs_src, syms_dst,
             filename='template.py.j2',
             date='date', asset='asset',
             alias: Dict[str, str] = {},
-            extra_codes: Sequence[str] = ()):
+            extra_codes: Sequence[str] = (),
+            **kwargs):
     """基于模板的代码生成"""
     # 打印Polars风格代码
     p = PolarsStrPrinter()

diff --git a/expr_codegen/polars_over/code.py b/expr_codegen/polars_over/code.py
@@ -1,5 +1,5 @@
 import os
-from typing import Sequence, Dict
+from typing import Sequence, Dict, Literal
 
 import jinja2
 from jinja2 import FileSystemLoader, TemplateNotFound
@@ -39,7 +39,9 @@ def codegen(exprs_ldl: ListDictList, exprs_src, syms_dst,
             filename='template.py.j2',
             date='date', asset='asset',
             alias: Dict[str, str] = {},
-            extra_codes: Sequence[str] = ()):
+            extra_codes: Sequence[str] = (),
+            over_null: Literal['order_by', 'partition_by', None] = 'partition_by',
+            **kwargs):
     """基于模板的代码生成"""
     # 打印Polars风格代码
     p = PolarsStrPrinter()
@@ -82,8 +84,12 @@ def codegen(exprs_ldl: ListDictList, exprs_src, syms_dst,
                             _sym = f"pl.all_horizontal({','.join(_sym)})"
                         else:
                             _sym = ','.join(_sym)
-                        # func_code.append(f"{va}=({s2}).over({_sym}, _ASSET_, order_by=_DATE_),")
-                        func_code.append(f"{va}=({s2}).over(_ASSET_, order_by=[{_sym}, _DATE_]),")
+                        if over_null == 'partition_by':
+                            func_code.append(f"{va}=({s2}).over({_sym}, _ASSET_, order_by=_DATE_),")
+                        elif over_null == 'order_by':
+                            func_code.append(f"{va}=({s2}).over(_ASSET_, order_by=[{_sym}, _DATE_]),")
+                        else:
+                            func_code.append(f"{va}=({s2}).over(_ASSET_, order_by=_DATE_),")
                     elif k[0] == CS:
                         func_code.append(f"{va}=({s2}).over(_DATE_),")
                     elif k[0] == GP:

diff --git a/expr_codegen/tool.py b/expr_codegen/tool.py
@@ -193,7 +193,8 @@ def all(self, exprs_src, style: Literal['pandas', 'polars_group', 'polars_over']
             replace: bool = True, regroup: bool = False, format: bool = True,
             date='date', asset='asset',
             alias: Dict[str, str] = {},
-            extra_codes: Sequence[object] = ()):
+            extra_codes: Sequence[object] = (),
+            **kwargs):
         """功能集成版，将几个功能写到一起方便使用
 
         Parameters
@@ -252,7 +253,8 @@ def all(self, exprs_src, style: Literal['pandas', 'polars_group', 'polars_over']
         codes = codegen(exprs_ldl, exprs_src, syms_dst,
                         filename=template_file, date=date, asset=asset,
                         alias=alias,
-                        extra_codes=extra_codes)
+                        extra_codes=extra_codes,
+                        **kwargs)
 
         if format:
             # 格式化。在遗传算法中没有必要
@@ -267,7 +269,8 @@ def _get_code(self,
                   output_file: str,
                   convert_xor: bool,
                   style: Literal['pandas', 'polars_group', 'polars_over'] = 'polars_over', template_file: str = 'template.py.j2',
-                  date: str = 'date', asset: str = 'asset') -> str:
+                  date: str = 'date', asset: str = 'asset',
+                  **kwargs) -> str:
         """通过字符串生成代码， 加了缓存，多次调用不重复生成"""
         raw, exprs_dict = sources_to_exprs(self.globals_, source, *more_sources, convert_xor=convert_xor)
 
@@ -279,7 +282,8 @@ def _get_code(self,
                              extra_codes=(raw,
                                           # 传入多个列的方法
                                           extra_codes,
-                                          ))
+                                          ),
+                             **kwargs)
 
         # 移回到cache，防止多次调用多次保存
         if isinstance(output_file, TextIOWrapper):
@@ -324,7 +328,8 @@ def codegen_exec(df: Optional[DataFrame],
                  style: Literal['pandas', 'polars_group', 'polars_over'] = 'polars_over',
                  template_file: str = 'template.py.j2',
                  date: str = 'date', asset: str = 'asset',
-                 ) -> Optional[DataFrame]:
+                 over_null: Literal['order_by', 'partition_by', None] = 'partition_by',
+                 **kwargs) -> Optional[DataFrame]:
     """快速转换源代码并执行
 
     Parameters
@@ -355,6 +360,11 @@ def codegen_exec(df: Optional[DataFrame],
         时间字段
     asset: str
         资产字段
+    over_null: str
+        时序中遇到null时的处理方式
+        - order_by: 空值排同一分区的前排
+        - partition_by: 空值划分到不同分区
+        - None: 不做处理
 
     Returns
     -------
@@ -391,6 +401,8 @@ def codegen_exec(df: Optional[DataFrame],
         convert_xor=convert_xor,
         style=style, template_file=template_file,
         date=date, asset=asset,
+        over_null=over_null,
+        **kwargs
     )
 
     if df is None: