
Commit 674ed48

[SPARK-54318][PYTHON][DOCS] Fix doctests in pyspark.sql.dataframe
### What changes were proposed in this pull request?

Fix doctests in `pyspark.sql.dataframe`.

### Why are the changes needed?

To refine docstrings and improve test coverage.

### Does this PR introduce _any_ user-facing change?

Yes, doc-only changes.

### How was this patch tested?

CI.

### Was this patch authored or co-authored using generative AI tooling?

No.

Closes #53013 from zhengruifeng/doctest_df.

Authored-by: Ruifeng Zheng <[email protected]>
Signed-off-by: Ruifeng Zheng <[email protected]>
1 parent d218805 commit 674ed48
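For anyone reproducing the CI run locally, these doctests are normally exercised through Spark's dev tooling (something like `python/run-tests --testnames pyspark.sql.dataframe` from the repository root), or directly with the standard library's doctest machinery. Below is a minimal sketch of the latter, roughly mirroring what pyspark's own `_test()` helpers do; the local master setting and app name are illustrative assumptions, not part of this commit:

import doctest

from pyspark.sql import SparkSession
import pyspark.sql.dataframe as mod

# The examples reference a `spark` global, so inject a session into the doctest globals.
spark = SparkSession.builder.master("local[4]").appName("dataframe-doctests").getOrCreate()
globs = mod.__dict__.copy()
globs["spark"] = spark

results = doctest.testmod(
    mod,
    globs=globs,
    optionflags=doctest.ELLIPSIS | doctest.NORMALIZE_WHITESPACE,
)
spark.stop()
print(results)  # e.g. TestResults(failed=0, attempted=...)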

File tree

1 file changed: +31 additions, -28 deletions


python/pyspark/sql/dataframe.py

Lines changed: 31 additions & 28 deletions
@@ -2099,13 +2099,13 @@ def sample(
 
         Examples
         --------
-        >>> df = spark.range(10)
+        >>> df = spark.range(0, 10, 1, 1)
         >>> df.sample(0.5, 3).count() # doctest: +SKIP
         7
-        >>> df.sample(fraction=0.5, seed=3).count() # doctest: +SKIP
-        7
-        >>> df.sample(withReplacement=True, fraction=0.5, seed=3).count() # doctest: +SKIP
-        1
+        >>> df.sample(fraction=0.5, seed=3).count()
+        4
+        >>> df.sample(withReplacement=True, fraction=0.5, seed=3).count()
+        2
         >>> df.sample(1.0).count()
         10
         >>> df.sample(fraction=1.0).count()
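The switch from `spark.range(10)` to `spark.range(0, 10, 1, 1)` pins the example DataFrame to a single partition, which is presumably what makes the seeded `sample` counts reproducible and lets the `# doctest: +SKIP` directives be dropped. A rough sketch of the idea; the printed counts are the ones recorded in the new doctest, not a guarantee for every environment:

from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()

# range(start, end, step, numPartitions): one partition gives the seeded
# sampler a fixed traversal order, so repeated runs draw the same rows.
df = spark.range(0, 10, 1, 1)
print(df.sample(fraction=0.5, seed=3).count())                        # 4 in the updated doctest
print(df.sample(withReplacement=True, fraction=0.5, seed=3).count())  # 2 in the updated doctest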
@@ -2187,8 +2187,8 @@ def sampleBy(
 
         Examples
         --------
-        >>> from pyspark.sql.functions import col
-        >>> dataset = spark.range(0, 100, 1, 5).select((col("id") % 3).alias("key"))
+        >>> from pyspark.sql import functions as sf
+        >>> dataset = spark.range(0, 100, 1, 5).select((sf.col("id") % 3).alias("key"))
         >>> sampled = dataset.sampleBy("key", fractions={0: 0.1, 1: 0.2}, seed=0)
         >>> sampled.groupBy("key").count().orderBy("key").show()
         +---+-----+
@@ -2198,7 +2198,7 @@ def sampleBy(
         |  1|    9|
         +---+-----+
 
-        >>> dataset.sampleBy(col("key"), fractions={2: 1.0}, seed=0).count()
+        >>> dataset.sampleBy(sf.col("key"), fractions={2: 1.0}, seed=0).count()
         33
         """
         ...
@@ -2315,9 +2315,9 @@ def columns(self) -> List[str]:
 
         Example 4: Iterating over columns to apply a transformation
 
-        >>> import pyspark.sql.functions as f
+        >>> import pyspark.sql.functions as sf
         >>> for col_name in df.columns:
-        ...     df = df.withColumn(col_name, f.upper(f.col(col_name)))
+        ...     df = df.withColumn(col_name, sf.upper(col_name))
         >>> df.show()
         +---+-----+-----+
         |age| name|state|
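The simplification from `f.upper(f.col(col_name))` to `sf.upper(col_name)` works because most helpers in `pyspark.sql.functions` accept either a `Column` or a column-name string. A small illustrative sketch (the sample data here is made up for the example):

from pyspark.sql import SparkSession, functions as sf

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([("70", "Alice", "NY")], ["age", "name", "state"])

# Passing the name directly is equivalent to wrapping it in sf.col(...)
by_name = df.select(sf.upper("name"))
by_col = df.select(sf.upper(sf.col("name")))
assert by_name.collect() == by_col.collect()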
@@ -2478,14 +2478,16 @@ def alias(self, alias: str) -> "DataFrame":
 
         Examples
         --------
-        >>> from pyspark.sql.functions import col, desc
+        >>> from pyspark.sql import functions as sf
         >>> df = spark.createDataFrame(
         ...     [(14, "Tom"), (23, "Alice"), (16, "Bob")], ["age", "name"])
         >>> df_as1 = df.alias("df_as1")
         >>> df_as2 = df.alias("df_as2")
-        >>> joined_df = df_as1.join(df_as2, col("df_as1.name") == col("df_as2.name"), 'inner')
+        >>> joined_df = df_as1.join(df_as2,
+        ...     sf.col("df_as1.name") == sf.col("df_as2.name"), 'inner')
         >>> joined_df.select(
-        ...     "df_as1.name", "df_as2.name", "df_as2.age").sort(desc("df_as1.name")).show()
+        ...     "df_as1.name", "df_as2.name", "df_as2.age"
+        ... ).sort(sf.desc("df_as1.name")).show()
         +-----+-----+---+
         | name| name|age|
         +-----+-----+---+
@@ -2610,7 +2612,7 @@ def join(
         they will appear with `NULL` in the `name` column of `df`, and vice versa for `df2`.
 
         >>> joined = df.join(df2, df.name == df2.name, "outer").sort(sf.desc(df.name))
-        >>> joined.show() # doctest: +SKIP
+        >>> joined.show()
         +-----+----+----+------+
         | name| age|name|height|
         +-----+----+----+------+
@@ -2621,7 +2623,7 @@ def join(
 
         To unambiguously select output columns, specify the dataframe along with the column name:
 
-        >>> joined.select(df.name, df2.height).show() # doctest: +SKIP
+        >>> joined.select(df.name, df2.height).show()
         +-----+------+
         | name|height|
         +-----+------+
@@ -4404,11 +4406,11 @@ def observe(
         --------
         When ``observation`` is :class:`Observation`, only batch queries work as below.
 
-        >>> from pyspark.sql.functions import col, count, lit, max
-        >>> from pyspark.sql import Observation
+        >>> from pyspark.sql import Observation, functions as sf
         >>> df = spark.createDataFrame([(2, "Alice"), (5, "Bob")], schema=["age", "name"])
         >>> observation = Observation("my metrics")
-        >>> observed_df = df.observe(observation, count(lit(1)).alias("count"), max(col("age")))
+        >>> observed_df = df.observe(observation,
+        ...     sf.count(sf.lit(1)).alias("count"), sf.max("age"))
         >>> observed_df.count()
         2
         >>> observation.get
@@ -4441,13 +4443,13 @@ def observe(
         >>> error_listener = MyErrorListener()
         >>> spark.streams.addListener(error_listener)
         >>> sdf = spark.readStream.format("rate").load().withColumn(
-        ...     "error", col("value")
+        ...     "error", sf.col("value")
         ... )
         >>> # Observe row count (rc) and error row count (erc) in the streaming Dataset
         ... observed_ds = sdf.observe(
         ...     "my_event",
-        ...     count(lit(1)).alias("rc"),
-        ...     count(col("error")).alias("erc"))
+        ...     sf.count(sf.lit(1)).alias("rc"),
+        ...     sf.count(sf.col("error")).alias("erc"))
         >>> try:
         ...     q = observed_ds.writeStream.format("console").start()
         ...     time.sleep(5)
@@ -4512,11 +4514,11 @@ def union(self, other: "DataFrame") -> "DataFrame":
 
         Example 2: Combining two DataFrames with different schemas
 
-        >>> from pyspark.sql.functions import lit
+        >>> from pyspark.sql import functions as sf
         >>> df1 = spark.createDataFrame([(100001, 1), (100002, 2)], schema="id LONG, money INT")
         >>> df2 = spark.createDataFrame([(3, 100003), (4, 100003)], schema="money INT, id LONG")
-        >>> df1 = df1.withColumn("age", lit(30))
-        >>> df2 = df2.withColumn("age", lit(40))
+        >>> df1 = df1.withColumn("age", sf.lit(30))
+        >>> df2 = df2.withColumn("age", sf.lit(40))
         >>> df3 = df1.union(df2)
         >>> df3.show()
         +------+------+---+
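As background for Example 2 (where `df2` deliberately declares its schema as `money INT, id LONG`): `union` resolves columns by position, not by name, so differing column orders can silently pair unrelated columns, and `unionByName` is the name-based alternative. A brief sketch of the contrast:

from pyspark.sql import SparkSession, functions as sf

spark = SparkSession.builder.getOrCreate()

df1 = spark.createDataFrame([(100001, 1)], schema="id LONG, money INT").withColumn("age", sf.lit(30))
df2 = spark.createDataFrame([(3, 100003)], schema="money INT, id LONG").withColumn("age", sf.lit(40))

df1.union(df2).show()        # positional: df2.money lands in df1's id column
df1.unionByName(df2).show()  # matched by name: id, money, and age line up correctly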
@@ -6065,10 +6067,10 @@ def transform(self, func: Callable[..., "DataFrame"], *args: Any, **kwargs: Any)
 
         Examples
         --------
-        >>> from pyspark.sql.functions import col
+        >>> from pyspark.sql import functions as sf
         >>> df = spark.createDataFrame([(1, 1.0), (2, 2.0)], ["int", "float"])
         >>> def cast_all_to_int(input_df):
-        ...     return input_df.select([col(col_name).cast("int") for col_name in input_df.columns])
+        ...     return input_df.select([sf.col(c).cast("int") for c in input_df.columns])
         ...
         >>> def sort_columns_asc(input_df):
         ...     return input_df.select(*sorted(input_df.columns))
@@ -6082,8 +6084,9 @@ def transform(self, func: Callable[..., "DataFrame"], *args: Any, **kwargs: Any)
         +-----+---+
 
         >>> def add_n(input_df, n):
-        ...     return input_df.select([(col(col_name) + n).alias(col_name)
-        ...                             for col_name in input_df.columns])
+        ...     cols = [(sf.col(c) + n).alias(c) for c in input_df.columns]
+        ...     return input_df.select(cols)
+        ...
         >>> df.transform(add_n, 1).transform(add_n, n=10).show()
         +---+-----+
         |int|float|
