-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathpandas function.py
More file actions
535 lines (461 loc) · 16.5 KB
/
pandas function.py
File metadata and controls
535 lines (461 loc) · 16.5 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
import pandas as pd
import numpy as np
# =========================================================
# 🐼 PANDAS CHEAT SHEET (CODE + OUTPUT)
# =========================================================
# ---------------------------------------------------------
# 1. CREATE DATA
# ---------------------------------------------------------
data = {
"Name": ["Alice", "Bob", "Charlie", "David"],
"Age": [24, 27, 22, 32],
"Salary": [50000, 60000, 55000, 65000]
}
df = pd.DataFrame(data)
print(df) # Name Age Salary
# Alice 24 50000
# Bob 27 60000
# Charlie 22 55000
# David 32 65000
# ---------------------------------------------------------
# 2. BASIC INFO
# ---------------------------------------------------------
print(df.head(2)) # First 2 rows → Alice, Bob
print(df.shape) # (4, 3)
print(df.columns) # Index(['Name','Age','Salary'])
print(df.dtypes) # Name object, Age int64, Salary int64
print(df.describe()) # Count, Mean, Std, Min, Max etc.
# ---------------------------------------------------------
# 3. SELECTION & INDEXING
# ---------------------------------------------------------
print(df["Name"]) # Column 'Name'
print(df[["Name","Age"]]) # Multiple columns
print(df.iloc[1]) # Row → Bob, 27, 60000
print(df.loc[0,"Name"]) # 'Alice'
print(df[df["Age"] > 25]) # Rows where Age > 25 → Bob, David
# ---------------------------------------------------------
# 4. ADD / UPDATE / DELETE
# ---------------------------------------------------------
df["Bonus"] = df["Salary"]*0.1
print(df) # Adds Bonus column
df["Age"] = df["Age"] + 1
print(df["Age"]) # Age increased by 1
df.drop("Bonus", axis=1, inplace=True)
print(df.columns) # Bonus removed
df.drop(2, axis=0, inplace=True)
print(df) # Row index 2 (Charlie) removed
# ---------------------------------------------------------
# 5. MISSING DATA
# ---------------------------------------------------------
df.loc[1,"Salary"] = np.nan
print(df) # Bob’s Salary = NaN
print(df.isna().sum()) # Shows 1 NaN in Salary
df["Salary"].fillna(0, inplace=True)
print(df) # Bob’s Salary replaced with 0
df.dropna(inplace=True)
print(df) # Drops rows with NaN (if any)
# ---------------------------------------------------------
# 6. AGGREGATION & STATS
# ---------------------------------------------------------
print(df["Salary"].mean()) # Mean salary = 38333.3
print(df["Age"].max()) # Max age = 33
print(df["Age"].min()) # Min age = 25
print(df["Salary"].std()) # Std deviation
# ---------------------------------------------------------
# 7. GROUPING
# ---------------------------------------------------------
print(df.groupby("Age")["Salary"].mean())
# Groups salaries by Age and shows mean
# ---------------------------------------------------------
# 8. SORTING
# ---------------------------------------------------------
print(df.sort_values("Age")) # Sort by Age ascending
print(df.sort_values("Salary", ascending=False)) # Sort by Salary desc
# ---------------------------------------------------------
# 9. MERGE / JOIN / CONCAT
# ---------------------------------------------------------
df1 = pd.DataFrame({"ID":[1,2,3],"Name":["A","B","C"]})
df2 = pd.DataFrame({"ID":[1,2,4],"Marks":[85,90,95]})
print(pd.merge(df1,df2,on="ID",how="inner"))
# ID 1,2 common
print(pd.concat([df1,df2],axis=0)) # Vertical concat
print(pd.concat([df1,df2],axis=1)) # Horizontal concat
# ---------------------------------------------------------
# 10. APPLY FUNCTIONS
# ---------------------------------------------------------
df["Salary_after_tax"] = df["Salary"].apply(lambda x: x*0.8)
print(df) # Adds Salary_after_tax column
print(df.apply(np.max,axis=0)) # Column-wise max values
print(df.apply(np.min,axis=1)) # Row-wise min values
# ---------------------------------------------------------
# 11. FILE OPERATIONS
# ---------------------------------------------------------
df.to_csv("output.csv",index=False)
df_loaded = pd.read_csv("output.csv")
# ---------------------------------------------------------
# 12. ADVANCED OPERATIONS
# ---------------------------------------------------------
df_pivot = pd.DataFrame({
"Department":["HR","HR","IT","IT","Sales"],
"Employee":["Alice","Bob","Charlie","David","Eve"],
"Salary":[50000,52000,60000,65000,70000]
})
print(df_pivot.pivot_table(index="Department",values="Salary",aggfunc="mean"))
# HR 51000, IT 62500, Sales 70000
print(pd.crosstab(df_pivot["Department"],df_pivot["Salary"]>55000))
# Boolean frequency count per dept
df["Name"] = df["Name"].str.upper()
print(df["Name"]) # Converts to uppercase
dates = pd.date_range("2025-01-01",periods=5,freq="D")
print(dates) # 2025-01-01 to 2025-01-05
# ---------------------------------------------------------
# 13. BOOLEAN FUNCTIONS
# ---------------------------------------------------------
print(df["Age"].all()) # True if all Age nonzero
print(df["Age"].any()) # True if any Age nonzero
print((df["Age"]>25).all()) # True if all > 25
#----------------------------------------------------------
# 14 . ALL ABOUT SERIES
#----------------------------------------------------------
data1= [101 , 102 , 104 , 105]
series = pd.Series(data1 , index=["A","B","C","D"])
print(series[series > 103])
# =====================================================
# 📌 PANDAS FILTERING & SELECTION CHEAT SHEET
# =====================================================
import pandas as pd
# Sample DataFrame
df = pd.DataFrame({
"Name": ["Alice", "Bob", "Charlie", "David", "Eve"],
"Age": [25, 30, 35, 40, 22],
"City": ["NY", "LA", "NY", "Chicago", "LA"],
"Score": [85, 92, 88, 70, 95]
})
print(df)
# -----------------------------------------------------
# Original DataFrame
# -----------------------------------------------------
# Name Age City Score
# 0 Alice 25 NY 85
# 1 Bob 30 LA 92
# 2 Charlie 35 NY 88
# 3 David 40 Chicago 70
# 4 Eve 22 LA 95
# -----------------------------------------------------
# 1. COLUMN SELECTION
# -----------------------------------------------------
print(df["Name"])
# 0 Alice
# 1 Bob
# 2 Charlie
# 3 David
# 4 Eve
# Name: Name, dtype: object
print(df[["Name", "Age"]])
# Name Age
# 0 Alice 25
# 1 Bob 30
# 2 Charlie 35
# 3 David 40
# 4 Eve 22
# -----------------------------------------------------
# 2. ROW SELECTION
# -----------------------------------------------------
print(df.loc[0])
# Name Alice
# Age 25
# City NY
# Score 85
# Name: 0, dtype: object
print(df.iloc[2])
# Name Charlie
# Age 35
# City NY
# Score 88
# Name: 2, dtype: object
print(df.loc[1:3])
# Name Age City Score
# 1 Bob 30 LA 92
# 2 Charlie 35 NY 88
# 3 David 40 Chicago 70
print(df.iloc[1:3])
# Name Age City Score
# 1 Bob 30 LA 92
# 2 Charlie 35 NY 88
# -----------------------------------------------------
# 3. CONDITIONAL FILTERING
# -----------------------------------------------------
print(df[df["Age"] > 30])
# Name Age City Score
# 2 Charlie 35 NY 88
# 3 David 40 Chicago 70
print(df[df["City"] == "LA"])
# Name Age City Score
# 1 Bob 30 LA 92
# 4 Eve 22 LA 95
print(df[(df["Age"] > 25) & (df["Score"] > 80)])
# Name Age City Score
# 1 Bob 30 LA 92
# 2 Charlie 35 NY 88
print(df[(df["Age"] < 30) | (df["City"] == "NY")])
# Name Age City Score
# 0 Alice 25 NY 85
# 1 Bob 30 LA 92
# 2 Charlie 35 NY 88
# 4 Eve 22 LA 95
# -----------------------------------------------------
# 4. FILTER SPECIFIC VALUES
# -----------------------------------------------------
print(df[df["City"].isin(["NY", "Chicago"])])
# Name Age City Score
# 0 Alice 25 NY 85
# 2 Charlie 35 NY 88
# 3 David 40 Chicago 70
# -----------------------------------------------------
# 5. FILTER MISSING VALUES
# -----------------------------------------------------
print(df[df["Score"].notna()])
# (Same as df, since no NaN)
print(df[df["Score"].isna()])
# Empty DataFrame
# Columns: [Name, Age, City, Score]
# Index: []
# -----------------------------------------------------
# 6. FILTER BY INDEX / LABELS
# -----------------------------------------------------
print(df.loc[[0, 2, 4], ["Name", "Score"]])
# Name Score
# 0 Alice 85
# 2 Charlie 88
# 4 Eve 95
# -----------------------------------------------------
# 7. QUERY METHOD
# -----------------------------------------------------
print(df.query("Age > 30 and City == 'NY'"))
# Name Age City Score
# 2 Charlie 35 NY 88
# =====================================================
# 📌 PANDAS AGGREGATION CHEAT SHEET
# =====================================================
import pandas as pdy
# Sample DataFrame
df = pd.DataFrame({
"Name": ["Alice", "Bob", "Charlie", "David", "Eve", "Frank", "Grace"],
"Age": [25, 30, 35, 40, 22, 30, 35],
"City": ["NY", "LA", "NY", "Chicago", "LA", "NY", "Chicago"],
"Score": [85, 92, 88, 70, 95, 80, 75]
})
print(df)
# -----------------------------------------------------
# Original DataFrame
# -----------------------------------------------------
# Name Age City Score
# 0 Alice 25 NY 85
# 1 Bob 30 LA 92
# 2 Charlie 35 NY 88
# 3 David 40 Chicago 70
# 4 Eve 22 LA 95
# 5 Frank 30 NY 80
# 6 Grace 35 Chicago 75
# -----------------------------------------------------
# 1. BASIC AGGREGATIONS
# -----------------------------------------------------
print(df["Score"].sum()) # 585
print(df["Score"].mean()) # 83.57142857142857
print(df["Score"].min()) # 70
print(df["Score"].max()) # 95
print(df["Score"].count()) # 7
print(df["Score"].median()) # 85.0
print(df["Score"].std()) # 9.257491872231993
# -----------------------------------------------------
# 2. MULTIPLE AGGREGATIONS ON A COLUMN
# -----------------------------------------------------
print(df["Score"].agg(["sum", "mean", "min", "max"]))
# sum 585.000000
# mean 83.571429
# min 70.000000
# max 95.000000
# Name: Score, dtype: float64
# -----------------------------------------------------
# 3. AGGREGATIONS ON WHOLE DATAFRAME
# -----------------------------------------------------
print(df.agg({"Age": ["mean", "max"], "Score": ["min", "max", "mean"]}))
# Age Score
# mean 31.000000 83.571429
# max 40.000000 95.000000
# min NaN 70.000000
# -----------------------------------------------------
# 4. GROUPBY AGGREGATION
# -----------------------------------------------------
print(df.groupby("City")["Score"].mean())
# City
# Chicago 72.5
# LA 93.5
# NY 84.333333
# Name: Score, dtype: float64
print(df.groupby("City")["Score"].agg(["min", "max", "mean"]))
# min max mean
# City
# Chicago 70 75 72.500000
# LA 92 95 93.500000
# NY 80 88 84.333333
# -----------------------------------------------------
# 5. MULTIPLE GROUPBY
# -----------------------------------------------------
print(df.groupby(["City", "Age"])["Score"].mean())
# City Age
# Chicago 35 75
# 40 70
# LA 22 95
# 30 92
# NY 25 85
# 30 80
# 35 88
# Name: Score, dtype: int64
# -----------------------------------------------------
# 6. VALUE COUNTS (special aggregation)
# -----------------------------------------------------
print(df["City"].value_counts())
# NY 3
# Chicago 2
# LA 2
# Name: City, dtype: int64
print(df["Age"].value_counts())
# 30 2
# 35 2
# 25 1
# 40 1
# 22 1
# Name: Age, dtype: int64
# -----------------------------------------------------
# 7. PIVOT TABLE (Aggregation + Reshape)
# -----------------------------------------------------
print(pd.pivot_table(df, values="Score", index="City", columns="Age", aggfunc="mean"))
# Age 22 25 30 35 40
# City
# Chicago NaN NaN NaN 75.0 70.0
# LA 95.0 NaN 92.0 NaN NaN
# NY NaN 85.0 80.0 88.0 NaN
# Sample DataFrame
df = pd.DataFrame({
"Name": ["Alice", "Bob", "Charlie", "David", "Eve"],
"Age": [25, 30, 35, 40, 22],
"City": ["NY", "LA", "NY", "Chicago", "LA"],
"Score": [85, 92, 88, 70, 95]
})
print(df)
# -----------------------------------------------------
# Aggregation on whole DataFrame (numeric_only=True)
# -----------------------------------------------------
result = df.agg(["sum", "mean", "min", "max"], numeric_only=True)
print(result)
# ✅ Output:
# Age Score
# sum 152 430
# mean 30.4 86.0
# min 22 70
# max 40 95
# =====================================================
# 📌 PANDAS DATA CLEANING CHEAT SHEET
# =====================================================
import pandas as pd
import numpy as np
# Sample messy dataset
df = pd.DataFrame({
"Name": ["Alice", "Bob", None, "David", "Eve", "Frank", "Grace"],
"Age": [25, None, 35, 40, "?", 30, 29],
"City": ["NY", "LA", "NY", None, "LA", "NY", "Chicago"],
"Score": [85, 92, np.nan, 70, 95, "missing", 75]
})
print(df)
# -----------------------------------------------------
# Original DataFrame
# -----------------------------------------------------
# Name Age City Score
# 0 Alice 25 NY 85
# 1 Bob None LA 92
# 2 None 35 NY NaN
# 3 David 40 None 70
# 4 Eve ? LA 95
# 5 Frank 30 NY missing
# 6 Grace 29 Chicago 75
# -----------------------------------------------------
# 1. DEALING WITH MISSING VALUES
# -----------------------------------------------------
print(df.isna()) # Check missing values
print(df.notna()) # Opposite of isna()
print(df.dropna()) # Drop rows with any NaN
# Name Age City Score
# 0 Alice 25 NY 85
# 4 Eve ? LA 95
# 5 Frank 30 NY missing
# 6 Grace 29 Chicago 75
print(df.fillna(0)) # Fill missing values with 0
# Name Age City Score
# Alice 25 NY 85
# Bob 0 LA 92
# None 35 NY 0
# David 40 0 70
# Eve ? LA 95
# Frank 30 NY missing
# Grace 29 Chicago 75
print(df.fillna(method="ffill")) # Forward fill
print(df.fillna(method="bfill")) # Backward fill
# -----------------------------------------------------
# 2. HANDLING WRONG DATA / REPLACING
# -----------------------------------------------------
df["Age"] = df["Age"].replace("?", np.nan)
df["Score"] = df["Score"].replace("missing", np.nan)
print(df)
# Name Age City Score
# 0 Alice 25 NY 85
# 1 Bob NaN LA 92
# 2 None 35 NY NaN
# 3 David 40 None 70
# 4 Eve NaN LA 95
# 5 Frank 30 NY NaN
# 6 Grace 29 Chicago 75
# -----------------------------------------------------
# 3. CONVERT DATA TYPES
# -----------------------------------------------------
df["Age"] = pd.to_numeric(df["Age"], errors="coerce") # Convert to numeric
df["Score"] = pd.to_numeric(df["Score"], errors="coerce") # Convert to numeric
print(df.dtypes)
# Name object
# Age float64
# City object
# Score float64
# -----------------------------------------------------
# 4. REMOVE DUPLICATES
# -----------------------------------------------------
df2 = pd.DataFrame({
"Name": ["Alice", "Bob", "Alice", "David"],
"Age": [25, 30, 25, 40]
})
print(df2.drop_duplicates())
# Name Age
# 0 Alice 25
# 1 Bob 30
# 3 David 40
# -----------------------------------------------------
# 5. RENAMING COLUMNS
# -----------------------------------------------------
df2.rename(columns={"Name": "FullName", "Age": "Years"}, inplace=True)
print(df2)
# FullName Years
# 0 Alice 25
# 1 Bob 30
# 2 Alice 25
# 3 David 40
# -----------------------------------------------------
# 6. STRING CLEANING
# -----------------------------------------------------
df3 = pd.DataFrame({"City": [" new york ", "los angeles", "CHICAGO ", "ny "]})
df3["City"] = df3["City"].str.strip().str.title()
print(df3)
# City
# 0 New York
# 1 Los Angeles
# 2 Chicago
# 3 Ny
# ------------------