Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Introduce UnknownSeries and UnknownIndex, type core.strings.pyi using them #1146

Merged
merged 40 commits into from
Mar 11, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
40 commits
Select commit Hold shift + click to select a range
0d48f36
make typing in pandas_stubs.core.strings.pyi strict, add UnknownSerie…
MarcoGorelli Mar 6, 2025
ca10bf2
undo pyproject.toml changes
MarcoGorelli Mar 6, 2025
4b8183d
use class, use pyright: strict
MarcoGorelli Mar 6, 2025
def6eea
update pyright
MarcoGorelli Mar 6, 2025
9c5b33a
reduce diff
MarcoGorelli Mar 6, 2025
9b63e3f
fixup
MarcoGorelli Mar 6, 2025
fd6188a
fixup
MarcoGorelli Mar 6, 2025
bcdd40e
include UnknownSeries in str.cat
MarcoGorelli Mar 6, 2025
dba1bda
move UnknownSeries and UnknownIndex location
MarcoGorelli Mar 6, 2025
6a31e87
use typealias
MarcoGorelli Mar 7, 2025
5edf982
use Series[str] as .cat return type
MarcoGorelli Mar 7, 2025
9a47508
use -> T so it matches other .str methods like .str.uppercase
MarcoGorelli Mar 7, 2025
0fabb99
use _TS2 for findall
MarcoGorelli Mar 7, 2025
427a707
add test to cover passing UnknownSeries to cat
MarcoGorelli Mar 7, 2025
de28385
preserve type in series.str
MarcoGorelli Mar 7, 2025
e40d245
simplify
MarcoGorelli Mar 7, 2025
92dc75d
use Mapping instead of dict as it is invariant
MarcoGorelli Mar 7, 2025
231b54d
fixup
MarcoGorelli Mar 7, 2025
45b8da0
split out into separate file
MarcoGorelli Mar 8, 2025
385b1bd
split out into separate file
MarcoGorelli Mar 8, 2025
412b1ab
type check boolean return values
MarcoGorelli Mar 8, 2025
2463ce9
integer return type
MarcoGorelli Mar 8, 2025
b0cade6
integer return type
MarcoGorelli Mar 8, 2025
29710a4
strings and bytes
MarcoGorelli Mar 8, 2025
3298868
list
MarcoGorelli Mar 8, 2025
5dfa7fa
expanding
MarcoGorelli Mar 8, 2025
3d581a8
fixup
MarcoGorelli Mar 8, 2025
005759c
keep fixing
MarcoGorelli Mar 8, 2025
aca32d5
keep fixing
MarcoGorelli Mar 8, 2025
b244308
overloads cat
MarcoGorelli Mar 8, 2025
0d1fc59
fixup str.extract
MarcoGorelli Mar 8, 2025
7ccfa0d
rename for clarity
MarcoGorelli Mar 8, 2025
b4839a0
lint
MarcoGorelli Mar 8, 2025
17e280f
annotate idx2 as per mypys request
MarcoGorelli Mar 8, 2025
208a55c
return _T_STR, except for `slice` because that one preserves the inpu…
MarcoGorelli Mar 10, 2025
3dc660e
mypy fixup
MarcoGorelli Mar 10, 2025
b2d4657
disallow .str on certain series types
Dr-Irv Mar 10, 2025
ce7575e
Revert "disallow .str on certain series types"
MarcoGorelli Mar 11, 2025
98cb162
Merge remote-tracking branch 'upstream/main' into strict-strings-typing
MarcoGorelli Mar 11, 2025
3e24de0
use Index of list[str]
MarcoGorelli Mar 11, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 13 additions & 1 deletion pandas-stubs/core/indexes/base.pyi
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ from typing import (
Any,
ClassVar,
Literal,
TypeAlias,
final,
overload,
)
Expand Down Expand Up @@ -263,7 +264,16 @@ class Index(IndexOpsMixin[S1]):
@property
def str(
self,
) -> StringMethods[Self, MultiIndex, np_ndarray_bool, Index[list[str]]]: ...
) -> StringMethods[
Self,
MultiIndex,
np_ndarray_bool,
Index[list[str]],
Index[int],
Index[bytes],
Index[str],
Index[type[object]],
]: ...
def is_(self, other) -> bool: ...
def __len__(self) -> int: ...
def __array__(self, dtype=...) -> np.ndarray: ...
Expand Down Expand Up @@ -455,6 +465,8 @@ class Index(IndexOpsMixin[S1]):
),
) -> Self: ...

UnknownIndex: TypeAlias = Index[Any]

def ensure_index_from_sequences(
sequences: Sequence[Sequence[Dtype]], names: list[str] = ...
) -> Index: ...
Expand Down
13 changes: 12 additions & 1 deletion pandas-stubs/core/series.pyi
Original file line number Diff line number Diff line change
Expand Up @@ -1179,7 +1179,16 @@ class Series(IndexOpsMixin[S1], NDFrame):
@property
def str(
self,
) -> StringMethods[Series, DataFrame, Series[bool], Series[list[str]]]: ...
) -> StringMethods[
Self,
DataFrame,
Series[bool],
Series[list[str]],
Series[int],
Series[bytes],
Series[str],
Series[type[object]],
]: ...
@property
def dt(self) -> CombinedDatetimelikeProperties: ...
@property
Expand Down Expand Up @@ -2318,3 +2327,5 @@ class IntervalSeries(Series[Interval[_OrderableT]], Generic[_OrderableT]):
@property
def array(self) -> IntervalArray: ...
def diff(self, periods: int = ...) -> Never: ...

UnknownSeries: TypeAlias = Series[Any]
177 changes: 99 additions & 78 deletions pandas-stubs/core/strings.pyi
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
# pyright: strict
from collections.abc import (
Callable,
Sequence,
Expand All @@ -12,6 +13,7 @@ from typing import (
)

import numpy as np
import numpy.typing as npt
import pandas as pd
from pandas import (
DataFrame,
Expand All @@ -21,23 +23,36 @@ from pandas import (
)
from pandas.core.base import NoNewAttributesMixin

from pandas._libs.tslibs.nattype import NaTType
from pandas._typing import (
JoinHow,
Scalar,
T,
np_ndarray_bool,
)

# The _TS type is what is used for the result of str.split with expand=True
_TS = TypeVar("_TS", bound=DataFrame | MultiIndex)
# The _TS2 type is what is used for the result of str.split with expand=False
_TS2 = TypeVar("_TS2", bound=Series[list[str]] | Index[list[str]])
# The _TM type is what is used for the result of str.match
_TM = TypeVar("_TM", bound=Series[bool] | np_ndarray_bool)
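# Used for DataFrame / MultiIndex results: str.split / str.rsplit with expand=True,
# str.partition / str.rpartition when expanding (the default), and str.get_dummies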
_T_EXPANDING = TypeVar("_T_EXPANDING", bound=DataFrame | MultiIndex)
# Used for list-of-strings results: str.split / str.rsplit with expand=False, and str.findall
_T_LIST_STR = TypeVar("_T_LIST_STR", bound=Series[list[str]] | Index[list[str]])
# Used for boolean results: str.match, str.fullmatch, str.contains,
# str.startswith / str.endswith, and the str.is* predicates
_T_BOOL = TypeVar("_T_BOOL", bound=Series[bool] | np_ndarray_bool)
# Used for integer results: str.index / str.rindex, str.find / str.rfind, str.count, and str.len
_T_INT = TypeVar("_T_INT", bound=Series[int] | Index[int])
# Used for the result of str.encode
_T_BYTES = TypeVar("_T_BYTES", bound=Series[bytes] | Index[bytes])
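# Used for string results: str.decode and the string-to-string methods
# (e.g. str.lower, str.strip, str.replace, str.cat with `others`)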
_T_STR = TypeVar("_T_STR", bound=Series[str] | Index[str])
# Used for object results: str.partition / str.rpartition and str.extract with expand=False
_T_OBJECT = TypeVar("_T_OBJECT", bound=Series[type[object]] | Index[type[object]])

class StringMethods(NoNewAttributesMixin, Generic[T, _TS, _TM, _TS2]):
class StringMethods(
NoNewAttributesMixin,
Generic[T, _T_EXPANDING, _T_BOOL, _T_LIST_STR, _T_INT, _T_BYTES, _T_STR, _T_OBJECT],
):
def __init__(self, data: T) -> None: ...
def __getitem__(self, key: slice | int) -> T: ...
def __iter__(self) -> T: ...
def __getitem__(self, key: slice | int) -> _T_STR: ...
def __iter__(self) -> _T_STR: ...
@overload
def cat(
self,
Expand All @@ -58,15 +73,17 @@ class StringMethods(NoNewAttributesMixin, Generic[T, _TS, _TM, _TS2]):
@overload
def cat(
self,
others: Series | pd.Index | pd.DataFrame | np.ndarray | list[Any],
others: (
Series[str] | Index[str] | pd.DataFrame | npt.NDArray[np.str_] | list[str]
),
sep: str = ...,
na_rep: str | None = ...,
join: JoinHow = ...,
) -> T: ...
) -> _T_STR: ...
@overload
def split(
self, pat: str = ..., *, n: int = ..., expand: Literal[True], regex: bool = ...
) -> _TS: ...
) -> _T_EXPANDING: ...
@overload
def split(
self,
Expand All @@ -75,77 +92,79 @@ class StringMethods(NoNewAttributesMixin, Generic[T, _TS, _TM, _TS2]):
n: int = ...,
expand: Literal[False] = ...,
regex: bool = ...,
) -> _TS2: ...
) -> _T_LIST_STR: ...
@overload
def rsplit(self, pat: str = ..., *, n: int = ..., expand: Literal[True]) -> _TS: ...
def rsplit(
self, pat: str = ..., *, n: int = ..., expand: Literal[True]
) -> _T_EXPANDING: ...
@overload
def rsplit(
self, pat: str = ..., *, n: int = ..., expand: Literal[False] = ...
) -> _TS2: ...
) -> _T_LIST_STR: ...
@overload
def partition(self, sep: str = ...) -> pd.DataFrame: ...
def partition(self, sep: str = ...) -> _T_EXPANDING: ...
@overload
def partition(self, *, expand: Literal[True]) -> pd.DataFrame: ...
def partition(self, *, expand: Literal[True]) -> _T_EXPANDING: ...
@overload
def partition(self, sep: str, expand: Literal[True]) -> pd.DataFrame: ...
def partition(self, sep: str, expand: Literal[True]) -> _T_EXPANDING: ...
@overload
def partition(self, sep: str, expand: Literal[False]) -> T: ...
def partition(self, sep: str, expand: Literal[False]) -> _T_OBJECT: ...
@overload
def partition(self, *, expand: Literal[False]) -> T: ...
def partition(self, *, expand: Literal[False]) -> _T_OBJECT: ...
@overload
def rpartition(self, sep: str = ...) -> pd.DataFrame: ...
def rpartition(self, sep: str = ...) -> _T_EXPANDING: ...
@overload
def rpartition(self, *, expand: Literal[True]) -> pd.DataFrame: ...
def rpartition(self, *, expand: Literal[True]) -> _T_EXPANDING: ...
@overload
def rpartition(self, sep: str, expand: Literal[True]) -> pd.DataFrame: ...
def rpartition(self, sep: str, expand: Literal[True]) -> _T_EXPANDING: ...
@overload
def rpartition(self, sep: str, expand: Literal[False]) -> T: ...
def rpartition(self, sep: str, expand: Literal[False]) -> _T_OBJECT: ...
@overload
def rpartition(self, *, expand: Literal[False]) -> T: ...
def get(self, i: int) -> T: ...
def join(self, sep: str) -> T: ...
def rpartition(self, *, expand: Literal[False]) -> _T_OBJECT: ...
def get(self, i: int) -> _T_STR: ...
def join(self, sep: str) -> _T_STR: ...
def contains(
self,
pat: str | re.Pattern,
pat: str | re.Pattern[str],
case: bool = ...,
flags: int = ...,
na=...,
na: Scalar | NaTType | None = ...,
regex: bool = ...,
) -> Series[bool]: ...
) -> _T_BOOL: ...
def match(
self, pat: str, case: bool = ..., flags: int = ..., na: Any = ...
) -> _TM: ...
) -> _T_BOOL: ...
def replace(
self,
pat: str,
repl: str | Callable[[re.Match], str],
repl: str | Callable[[re.Match[str]], str],
n: int = ...,
case: bool | None = ...,
flags: int = ...,
regex: bool = ...,
) -> T: ...
def repeat(self, repeats: int | Sequence[int]) -> T: ...
) -> _T_STR: ...
def repeat(self, repeats: int | Sequence[int]) -> _T_STR: ...
def pad(
self,
width: int,
side: Literal["left", "right", "both"] = ...,
fillchar: str = ...,
) -> T: ...
def center(self, width: int, fillchar: str = ...) -> T: ...
def ljust(self, width: int, fillchar: str = ...) -> T: ...
def rjust(self, width: int, fillchar: str = ...) -> T: ...
def zfill(self, width: int) -> T: ...
) -> _T_STR: ...
def center(self, width: int, fillchar: str = ...) -> _T_STR: ...
def ljust(self, width: int, fillchar: str = ...) -> _T_STR: ...
def rjust(self, width: int, fillchar: str = ...) -> _T_STR: ...
def zfill(self, width: int) -> _T_STR: ...
def slice(
self, start: int | None = ..., stop: int | None = ..., step: int | None = ...
) -> T: ...
def slice_replace(
self, start: int | None = ..., stop: int | None = ..., repl: str | None = ...
) -> T: ...
def decode(self, encoding: str, errors: str = ...) -> T: ...
def encode(self, encoding: str, errors: str = ...) -> T: ...
def strip(self, to_strip: str | None = ...) -> T: ...
def lstrip(self, to_strip: str | None = ...) -> T: ...
def rstrip(self, to_strip: str | None = ...) -> T: ...
) -> _T_STR: ...
def decode(self, encoding: str, errors: str = ...) -> _T_STR: ...
def encode(self, encoding: str, errors: str = ...) -> _T_BYTES: ...
def strip(self, to_strip: str | None = ...) -> _T_STR: ...
def lstrip(self, to_strip: str | None = ...) -> _T_STR: ...
def rstrip(self, to_strip: str | None = ...) -> _T_STR: ...
def wrap(
self,
width: int,
Expand All @@ -154,45 +173,47 @@ class StringMethods(NoNewAttributesMixin, Generic[T, _TS, _TM, _TS2]):
drop_whitespace: bool | None = ...,
break_long_words: bool | None = ...,
break_on_hyphens: bool | None = ...,
) -> T: ...
def get_dummies(self, sep: str = ...) -> pd.DataFrame: ...
def translate(self, table: dict[int, int | str | None] | None) -> T: ...
def count(self, pat: str, flags: int = ...) -> Series[int]: ...
def startswith(self, pat: str | tuple[str, ...], na: Any = ...) -> Series[bool]: ...
def endswith(self, pat: str | tuple[str, ...], na: Any = ...) -> Series[bool]: ...
def findall(self, pat: str, flags: int = ...) -> Series: ...
) -> _T_STR: ...
def get_dummies(self, sep: str = ...) -> _T_EXPANDING: ...
def translate(self, table: dict[int, int | str | None] | None) -> _T_STR: ...
def count(self, pat: str, flags: int = ...) -> _T_INT: ...
def startswith(self, pat: str | tuple[str, ...], na: Any = ...) -> _T_BOOL: ...
def endswith(self, pat: str | tuple[str, ...], na: Any = ...) -> _T_BOOL: ...
def findall(self, pat: str, flags: int = ...) -> _T_LIST_STR: ...
@overload
def extract(
self, pat: str, flags: int = ..., *, expand: Literal[True] = ...
) -> pd.DataFrame: ...
@overload
def extract(self, pat: str, flags: int, expand: Literal[False]) -> T: ...
def extract(self, pat: str, flags: int, expand: Literal[False]) -> _T_OBJECT: ...
@overload
def extract(self, pat: str, flags: int = ..., *, expand: Literal[False]) -> T: ...
def extract(
self, pat: str, flags: int = ..., *, expand: Literal[False]
) -> _T_OBJECT: ...
def extractall(self, pat: str, flags: int = ...) -> pd.DataFrame: ...
def find(self, sub: str, start: int = ..., end: int | None = ...) -> T: ...
def rfind(self, sub: str, start: int = ..., end: int | None = ...) -> T: ...
def normalize(self, form: Literal["NFC", "NFKC", "NFD", "NFKD"]) -> T: ...
def index(self, sub: str, start: int = ..., end: int | None = ...) -> T: ...
def rindex(self, sub: str, start: int = ..., end: int | None = ...) -> T: ...
def len(self) -> Series[int]: ...
def lower(self) -> T: ...
def upper(self) -> T: ...
def title(self) -> T: ...
def capitalize(self) -> T: ...
def swapcase(self) -> T: ...
def casefold(self) -> T: ...
def isalnum(self) -> Series[bool]: ...
def isalpha(self) -> Series[bool]: ...
def isdigit(self) -> Series[bool]: ...
def isspace(self) -> Series[bool]: ...
def islower(self) -> Series[bool]: ...
def isupper(self) -> Series[bool]: ...
def istitle(self) -> Series[bool]: ...
def isnumeric(self) -> Series[bool]: ...
def isdecimal(self) -> Series[bool]: ...
def find(self, sub: str, start: int = ..., end: int | None = ...) -> _T_INT: ...
def rfind(self, sub: str, start: int = ..., end: int | None = ...) -> _T_INT: ...
def normalize(self, form: Literal["NFC", "NFKC", "NFD", "NFKD"]) -> _T_STR: ...
def index(self, sub: str, start: int = ..., end: int | None = ...) -> _T_INT: ...
def rindex(self, sub: str, start: int = ..., end: int | None = ...) -> _T_INT: ...
def len(self) -> _T_INT: ...
def lower(self) -> _T_STR: ...
def upper(self) -> _T_STR: ...
def title(self) -> _T_STR: ...
def capitalize(self) -> _T_STR: ...
def swapcase(self) -> _T_STR: ...
def casefold(self) -> _T_STR: ...
def isalnum(self) -> _T_BOOL: ...
def isalpha(self) -> _T_BOOL: ...
def isdigit(self) -> _T_BOOL: ...
def isspace(self) -> _T_BOOL: ...
def islower(self) -> _T_BOOL: ...
def isupper(self) -> _T_BOOL: ...
def istitle(self) -> _T_BOOL: ...
def isnumeric(self) -> _T_BOOL: ...
def isdecimal(self) -> _T_BOOL: ...
def fullmatch(
self, pat: str, case: bool = ..., flags: int = ..., na: Any = ...
) -> Series[bool]: ...
def removeprefix(self, prefix: str) -> T: ...
def removesuffix(self, suffix: str) -> T: ...
) -> _T_BOOL: ...
def removeprefix(self, prefix: str) -> _T_STR: ...
def removesuffix(self, suffix: str) -> _T_STR: ...
Loading