-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathPredict Earnings Surprises.py
630 lines (479 loc) · 36.2 KB
/
Predict Earnings Surprises.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
#%%
# =============================================================================
# To do: Change the directory variable to the location of
# the folder "Thesis Submission_GiaPhatTram"
# For example: E:\Students\Thesis\Thesis Submission - Gia Phat Tram
# =============================================================================
import os
directory = "D:\Thesis Submission - Gia Phat Tram"
os.chdir(directory)
#%%
# =============================================================================
# Import relevant packages
# =============================================================================
import pandas as pd
import numpy as np
import os
from datetime import datetime
#%%
# =============================================================================
# =============================================================================
# =============================================================================
# =============================================================================
# =============================================================================
# =============================================================================
# File Description: Forecasting Earnings Surprises
# Main outcome: Section 7 allows for choosing models and generating results
# which are displayed in console
# =============================================================================
# =============================================================================
# =============================================================================
# =============================================================================
# =============================================================================
# =============================================================================
#%%
# ==========================================================================================================================================================
# ==========================================================================================================================================================
# Section 1: Preparing IBES-CRSP Link Table
# Note: keep only score 1 and score 2
# Main outcome variable: ibes_crsp_link_score1 and ibes_crsp_link_score2
# ==========================================================================================================================================================
# ==========================================================================================================================================================
ibes_crsp_link = pd.read_csv("./Data/Securities/[stock] ibes_crsp_link (from wrds).csv", sep = ";")
ibes_crsp_link['sdate'] = pd.to_datetime(ibes_crsp_link.sdate, format = '%d-%b-%y')
ibes_crsp_link['edate'] = pd.to_datetime(ibes_crsp_link.edate, format = '%d-%b-%y')
#Keep only score 1 and 2
ibes_crsp_link = ibes_crsp_link[ibes_crsp_link.SCORE.isin([1,2])]
#Transform the data so it can be mapped to IBES data later
ibes_crsp_link = pd.melt(ibes_crsp_link, id_vars = ['TICKER', 'PERMNO', 'NCUSIP', 'SCORE'], value_name = 'Date').drop('variable', axis = 1)
ibes_crsp_link = ibes_crsp_link.sort_values(['PERMNO', 'Date'])
ibes_crsp_link.set_index('Date', inplace = True)
ibes_crsp_link = ibes_crsp_link.groupby(['TICKER', 'PERMNO', 'NCUSIP', 'SCORE']).resample('D').ffill()
ibes_crsp_link = ibes_crsp_link.reset_index(level = [0,1,2,3], drop = True)
ibes_crsp_link = ibes_crsp_link.reset_index()
ibes_crsp_link = ibes_crsp_link.sort_values(['PERMNO', 'Date'])
#Split ibes_crsp_link into two dataframes, one for score of 1 and one for score of 2
ibes_crsp_link_score1 = ibes_crsp_link.copy(deep = True)
ibes_crsp_link_score1 = ibes_crsp_link_score1[ibes_crsp_link_score1.SCORE == 1]
ibes_crsp_link_score2 = ibes_crsp_link.copy(deep = True)
ibes_crsp_link_score2 = ibes_crsp_link_score2[ibes_crsp_link_score2.SCORE == 2]
del ibes_crsp_link
#%%
#%%
# ==========================================================================================================================================================
# ==========================================================================================================================================================
# Section 2: Import IBES data into variable esp_estimates
# and use IBES-CRSP Link Table to map Permno to IBES data
# Main outcome variable: esp_estimates
# ==========================================================================================================================================================
# ==========================================================================================================================================================
#Import IBES data
esp_estimates = pd.read_csv("./Data/Securities/[stock] IBES - ESP Estimates.csv")
# Format datetime variables
esp_estimates['ANNDATS'] = pd.to_datetime(esp_estimates.ANNDATS, format = "%d/%m/%Y")
esp_estimates['FPEDATS'] = pd.to_datetime(esp_estimates.FPEDATS, format = "%d/%m/%Y")
esp_estimates['ANNDATS_ACT'] = pd.to_datetime(esp_estimates.ANNDATS_ACT, format = "%d/%m/%Y")
# Merge permno from ibes_crsp_link_score1 to ibes data
esp_estimates = pd.merge(left = esp_estimates, right = ibes_crsp_link_score1,\
left_on = ['CUSIP', 'ANNDATS'], \
right_on = ['NCUSIP', 'Date'], how = 'left').\
drop(['TICKER_y', 'Date', 'NCUSIP', 'SCORE'], axis = 1)
esp_estimates.rename({'TICKER_x':'TICKER'}, axis = 1, inplace = True)
# Merge permno from ibes_crsp_link_score2 to ibes data
esp_estimates = pd.merge(left = esp_estimates, right = ibes_crsp_link_score2,\
left_on = ['TICKER', 'ANNDATS'], \
right_on = ['TICKER', 'Date'], how = 'left').\
drop(['Date', 'NCUSIP', 'SCORE'], axis = 1)
del ibes_crsp_link_score1, ibes_crsp_link_score2
# Check if score 1 and score 2 map to overlapping observations
print("Score1 and Score2 map to unique rows in eps_esimates: {}".format(\
esp_estimates[esp_estimates.PERMNO_x.notnull()&esp_estimates.PERMNO_y.notnull()].shape[0]==0))
# Double check: in case score 1 and score 2 map to overlapping observations,
# choose permno from score 1
esp_estimates['PERMNO'] = np.where(esp_estimates.PERMNO_x.isnull(), \
esp_estimates.PERMNO_y,\
esp_estimates.PERMNO_x)
esp_estimates.drop(['PERMNO_x', 'PERMNO_y'], axis = 1, inplace = True)
#drop rows where there is no PERMNO or no actual
esp_estimates = esp_estimates[(esp_estimates.PERMNO.notnull())&(esp_estimates.ACTUAL.notnull())]
#%%
# ==========================================================================================================================================================
# ==========================================================================================================================================================
# Section 3: Keep the last estimate in the intended quarter according to FPI
#
# Explanation: The variable FPI indicates which earnings actual the forecast is intended for.
# For example, FPI = 6 indicates that the forecast is for the earnings actual in the next quarter.
# FPI = 9 indicates that the forecast is for the earnings actual four quarters ahead.
# However, as I can observe from the dataset, FPI may not always work as intended. For example,
# after an analyst makes a forecast for the earnings actual 4 quarters ahead, in which case
# the FPI for this forecast is 9, the analyst can revise the forecast multiple times, each revision
# appears in the dataset in a new observation. Oftentimes, the forecast revision gets into another quarter,
# even though the FPI is unchanged.
# Therefore, this section will clean the dataset such that it will only keep the last estimate in
# the quarter intended by FPI. If there are more than estimates within the intended quarter,
# then I keep the last estimate in that quarter.
#
# Important assumption: actual announcement is for the quarter in which the announcement takes places
# Assumption 2 is not correct. For some companies, e.g. CUSIP 36720410
# can have two announcements in the same quarter, one for the current quarter and
# one for the previous quarter. The data is so unsystematic I couldn't find
# a way to control for this yet. Fortunately, assumption 2 largely holds for
# most of the time
#
# Main outcome variable: esp_estimates
# ==========================================================================================================================================================
# ==========================================================================================================================================================
esp_estimates = esp_estimates[['PERMNO', 'TICKER', 'CUSIP', 'CNAME', 'ESTIMATOR', 'FPI',\
'FPEDATS', 'ANNDATS', 'VALUE', 'ACTUAL', 'ANNDATS_ACT',]]
# =============================================================================
# Since each firm may have a different fiscal calendar, I align the fiscal calendar for all observations
# to standard calendar (year starting with January)
# Important assumption: months in FPEDATS indicate fiscal quarter of the firm/stock
# =============================================================================
esp_estimates['fpemonth'] = esp_estimates.FPEDATS.dt.month
conditions = [esp_estimates.fpemonth.isin([1,4,7,10]),\
esp_estimates.fpemonth.isin([2,5,8,11]),\
esp_estimates.fpemonth.isin([3,6,9,12])]
choices = [pd.offsets.DateOffset(months = 1),\
pd.offsets.DateOffset(months = 2),\
pd.offsets.DateOffset(months = 0)]
esp_estimates['align_month'] = np.select(conditions, choices)
esp_estimates['FPEDATS'] = np.where(esp_estimates.fpemonth.isin([1,4,7,10,2,5,8,11]),\
esp_estimates.FPEDATS - esp_estimates.align_month,\
esp_estimates.FPEDATS)
esp_estimates['ANNDATS'] = np.where(esp_estimates.fpemonth.isin([1,4,7,10,2,5,8,11]),\
esp_estimates.ANNDATS - esp_estimates.align_month,\
esp_estimates.ANNDATS)
esp_estimates['ANNDATS_ACT'] = np.where(esp_estimates.fpemonth.isin([1,4,7,10,2,5,8,11]),\
esp_estimates.ANNDATS_ACT - esp_estimates.align_month,\
esp_estimates.ANNDATS_ACT)
# =============================================================================
#Create a variable slided_actual_annc that slides actual announcement date back according to FPI,
#this variable allows for pinning down which quarter is intended by FPI
# =============================================================================
conditions = [esp_estimates.FPI == 6,\
esp_estimates.FPI == 7,\
esp_estimates.FPI == 8,\
esp_estimates.FPI == 9]
choices = [pd.offsets.DateOffset(months = 3),\
pd.offsets.DateOffset(months = 6),\
pd.offsets.DateOffset(months = 9),\
pd.offsets.DateOffset(months = 12)]
esp_estimates['slided_actual_annc']= np.select(conditions, choices)
del conditions, choices
esp_estimates['slided_actual_annc'] = esp_estimates.ANNDATS_ACT - esp_estimates.slided_actual_annc
#slide this to the end of the intended quarter
esp_estimates['slided_actual_annc'] = np.where(esp_estimates.slided_actual_annc.dt.is_quarter_end,\
esp_estimates.slided_actual_annc,\
esp_estimates.slided_actual_annc + pd.tseries.offsets.QuarterEnd())
#keep only rows where estimate announcement date is in the same quarter with
# slided_actual_annc
esp_estimates = esp_estimates[(esp_estimates.ANNDATS.dt.quarter == esp_estimates.slided_actual_annc.dt.quarter)&\
(esp_estimates.ANNDATS.dt.year == esp_estimates.slided_actual_annc.dt.year)]
esp_estimates_before = esp_estimates.copy(deep = True)
print('Before idxmax, the dataset length is {}'.format(esp_estimates.shape[0]))
# For each stock, for each actual announcement and by each estimator,
# keep only the rows of the last estimate in the quarter intended by FPI
esp_estimates = esp_estimates.loc[esp_estimates.groupby(['PERMNO', 'ESTIMATOR', 'FPI', 'ANNDATS_ACT']).ANNDATS.idxmax()]
# Check whether after the cleaning,
# for each firm, each actual announcement, each estimator (analyst),
# there are now only unique FPI
temp1 = esp_estimates.groupby(['PERMNO','ESTIMATOR', 'ANNDATS_ACT']).FPI.count().reset_index().rename({'FPI':'row_count'}, axis =1)
temp2 = esp_estimates.groupby(['PERMNO','ESTIMATOR', 'ANNDATS_ACT']).FPI.nunique().reset_index().rename({'FPI':'unique_FPI'}, axis =1 )
temp = pd.merge(left = temp1, right = temp2, on = ['PERMNO', 'ESTIMATOR', 'ANNDATS_ACT'], how = 'inner')
temp['check'] = temp.row_count == temp.unique_FPI
print("After idxmax, for each stock, each estimator and each actual announcement, there are now only unique FPI: {}".format(temp[temp.check==False].shape[0]==0))
del temp, temp1, temp2
esp_estimates.drop(['fpemonth', 'align_month'], axis = 1, inplace = True)
#%%
# ==========================================================================================================================================================
# ==========================================================================================================================================================
# Section 4: Compute unexpected earnings (earnings surprises)
# ==========================================================================================================================================================
# ==========================================================================================================================================================
# =============================================================================
# Sliding date variables
# =============================================================================
#Sliding ANNDATS to quarter end
esp_estimates['slided_ANNDATS'] = np.where(esp_estimates.ANNDATS.dt.is_quarter_end,\
esp_estimates.ANNDATS,\
esp_estimates.ANNDATS + pd.tseries.offsets.QuarterEnd())
# Check if slided_ANNDATS == slided_actual_annc
# Answer: yes. Uncomment to run and check
#temp = esp_estimates.slided_ANNDATS == esp_estimates.slided_actual_annc
#temp = esp_estimates[temp==False]
#temp[temp==False].shape
#del temp
esp_estimates.drop('slided_actual_annc', axis =1, inplace = True)
#Sliding ANNDATS_ACT to quarter end
esp_estimates['slided_ANNDATS_ACT'] = np.where(esp_estimates.ANNDATS_ACT.dt.is_quarter_end,\
esp_estimates.ANNDATS_ACT,\
esp_estimates.ANNDATS_ACT + pd.tseries.offsets.QuarterEnd())
# =============================================================================
# Calculate the mean of analyst estimates for each stock-fpi-earnings announcement
# and
# Calculate unexpected earnings
# =============================================================================
#Find the mean of estimates for each stock-fpi-earning annoucement
temp = esp_estimates.copy(deep = True)
temp = temp.groupby(['PERMNO', 'FPI', 'ANNDATS_ACT']).agg({'TICKER':'first',
'CUSIP': 'first', 'CNAME': 'first', 'FPEDATS':'first',\
'ANNDATS':'first', 'VALUE': 'mean', 'ACTUAL': 'mean',\
'slided_ANNDATS': 'first', 'slided_ANNDATS_ACT':'first'}).reset_index()
temp.rename({'ACTUAL':'ExpectedACTUAL'}, axis = 1, inplace = True) #because the estimate is the median or mean
#in few instances, for each permno and FPI, there could be many earning announcement in the same quarter
# This shows violation of the assumption mentioned in Section 3
# To expedite the analysis, I take the last earning announcement within a quarter
temp.sort_values(['PERMNO', 'FPI', 'ANNDATS_ACT'], inplace = True)
temp = temp.groupby(['PERMNO', 'FPI', 'slided_ANNDATS_ACT']).last()
temp.reset_index(inplace = True)
#Calculate unexpected earnings
temp['UE'] = temp.VALUE - temp.ExpectedACTUAL
temp = temp[['PERMNO', 'FPI', 'CNAME', 'ANNDATS', 'slided_ANNDATS','slided_ANNDATS_ACT', 'UE']]
#Split the resulting dataframe into
# (1) unexpected earnings for earnings actual one quarter ahead
ibesUE6 = temp.copy(deep = True)
ibesUE6 = ibesUE6 [ibesUE6 .FPI == 6].drop('FPI', axis = 1)
ibesUE6.rename({'slided_ANNDATS': 'estimate3M', 'slided_ANNDATS_ACT': 'announcement3M', 'UE': 'UE3M'}, axis = 1, inplace = True)
# (2) unexpected earnings for earnings actual two quarters ahead
ibesUE7 = temp.copy(deep = True)
ibesUE7 = ibesUE7[ibesUE7.FPI == 7].drop('FPI', axis = 1)
ibesUE7.rename({'slided_ANNDATS': 'estimate6M', 'slided_ANNDATS_ACT': 'announcement6M', 'UE': 'UE6M'}, axis = 1, inplace = True)
# (3) unexpected earnings for earnings actual three quarters ahead
ibesUE8 = temp.copy(deep = True)
ibesUE8 = ibesUE8[ibesUE8.FPI == 8].drop('FPI', axis =1)
ibesUE8.rename({'slided_ANNDATS': 'estimate9M', 'slided_ANNDATS_ACT': 'announcement9M', 'UE': 'UE9M'}, axis = 1, inplace = True)
# (4) unexpected earnings for earnings actual four quarters ahead
ibesUE9 = temp.copy(deep = True)
ibesUE9 = ibesUE9[ibesUE9.FPI == 9].drop('FPI', axis =1)
ibesUE9.rename({'slided_ANNDATS': 'estimate12M', 'slided_ANNDATS_ACT': 'announcement12M', 'UE': 'UE12M'}, axis = 1, inplace = True)
del temp
#%%
# ==========================================================================================================================================================
# ==========================================================================================================================================================
# Section 5: Calculate aggregated active weight and aggregated squared active weight
# Main outcome variable: aggrHoldings
# ==========================================================================================================================================================
# ==========================================================================================================================================================
#Create a copy of mfHoldings to avoid modifying the variable itself
holdings = mfHoldings.copy(deep = True)
holdings.replace([np.inf, -np.inf], np.nan, inplace = True)
# =============================================================================
# Calculate active weight change, accounting to passive weight change due to price changes
# This is similarly done in the calculation of trade-based AFP,
# function calculateTrade_BasedAFP in AFP Analysis Part 1 - Univariate.py
# I copy the explanation here again for convenience
# Explanation: I will attempt to explain how I calculate the active weight change
# while accounting for passive weight change due to price changes.
# The variable mfHoldings contains three columns "PriceFromForward3M", "PriceFromForward6M", "PriceFromForward12M",
# which contains price from 3, 6 and 12 months ahead. These prices are then multiplied by the
# contemporaneous number of shares held nbr_shares. The products are then divided by
# the contemporaneous total market value of the fund, the quotients are recorded in
# three columns 'passiveWeight3M', 'passiveWeight6M', 'passiveWeight12M'. So as you can see,
# these passive weight columns are calculated using future prices. In other to calculate active weight changes,
# I need to take these three columns out of the variable mfHoldings, slide the date forward so that
# the passive weight is aligned with the right period to calculate active weight.
# For example, I can take the column passiveWeight3M out of mfHoldings, slide the date forward by 3 months
# when the price was used to calculate passiveWeight3M, then I merge this back to mfHoldings.
# Because the date has been slided forward by 3M, the column passiveWeight3M is not correctly aligned
# with the observation that contains the price that was used to calculate passiveWeight3M.
# I repeat this for passiveWeight6M and passiveWeight12M.
# Actually this is my bad coding practice. I shouldn't keep the there variables passiveWeights in mfHoldings,
# but to put them in a separate variable to avoid confusion.
# =============================================================================
holdings3M = mfHoldings.copy(deep = True)
holdings3M = holdings3M[['crsp_portno', 'security_name', 'slided_report_dt', 'passiveWeight3M']]
holdings3M['slided_report_dt'] = holdings3M.slided_report_dt + pd.offsets.DateOffset(months = 3)
holdings3M['slided_report_dt'] = np.where(holdings3M.slided_report_dt.dt.is_quarter_end,\
holdings3M.slided_report_dt,\
holdings3M.slided_report_dt + pd.tseries.offsets.QuarterEnd())
holdings6M = mfHoldings.copy(deep = True)
holdings6M = holdings6M[['crsp_portno', 'security_name', 'slided_report_dt', 'passiveWeight6M']]
holdings6M['slided_report_dt'] = holdings6M.slided_report_dt + pd.offsets.DateOffset(months = 6)
holdings6M['slided_report_dt'] = np.where(holdings6M.slided_report_dt.dt.is_quarter_end,\
holdings6M.slided_report_dt,\
holdings6M.slided_report_dt + pd.tseries.offsets.QuarterEnd())
holdings12M = mfHoldings.copy(deep = True)
holdings12M = holdings12M[['crsp_portno', 'security_name', 'slided_report_dt', 'passiveWeight12M']]
holdings12M['slided_report_dt'] = holdings12M.slided_report_dt + pd.offsets.DateOffset(months = 12)
holdings12M['slided_report_dt'] = np.where(holdings12M.slided_report_dt.dt.is_quarter_end,\
holdings12M.slided_report_dt,\
holdings12M.slided_report_dt + pd.tseries.offsets.QuarterEnd())
holdings.drop(['passiveWeight12M', 'passiveWeight3M', 'passiveWeight6M'], axis = 1, inplace = True)
# =============================================================================
# Merge the three variables with passive weighst to a copy of mfHoldings
# to calculate active weight. See the explanation above for clarification
# =============================================================================
holdings = pd.merge(left = holdings, right = holdings3M,\
on = ['crsp_portno', 'security_name', 'slided_report_dt'],\
how = 'left')
holdings = pd.merge(left = holdings, right = holdings6M,\
on = ['crsp_portno', 'security_name', 'slided_report_dt'],\
how = 'left')
holdings = pd.merge(left = holdings, right = holdings12M,\
on = ['crsp_portno', 'security_name', 'slided_report_dt'],\
how = 'left')
holdings.rename({'passiveWeight12M': 'aligned_passiveWeight12M', 'passiveWeight3M': 'aligned_passiveWeight3M', 'passiveWeight6M': 'aligned_passiveWeight6M'}, axis = 1,inplace = True)
holdings['activeWeight3M'] = holdings.weight - holdings.aligned_passiveWeight3M
holdings['activeWeight6M'] = holdings.weight - holdings.aligned_passiveWeight6M
holdings['activeWeight12M'] = holdings.weight - holdings.aligned_passiveWeight12M
holdings['activeWeight3MSquared'] = holdings.activeWeight3M * holdings.activeWeight3M
holdings['activeWeight6MSquared'] = holdings.activeWeight6M * holdings.activeWeight6M
holdings['activeWeight12MSquared'] = holdings.activeWeight12M * holdings.activeWeight12M
holdings = holdings[['crsp_portno', 'permno', 'security_name', 'slided_report_dt', 'activeWeight3M', 'activeWeight6M', 'activeWeight12M', 'activeWeight3MSquared',\
'activeWeight6MSquared', 'activeWeight12MSquared']]
# =============================================================================
# Calculate aggregated active weight changes and the square thereof
# =============================================================================
aggrHoldings = holdings.groupby(['security_name', 'slided_report_dt']).agg({'permno': 'first', 'activeWeight3M': 'sum', \
'activeWeight6M': 'sum', 'activeWeight12M':'sum', 'activeWeight3MSquared': 'sum',\
'activeWeight6MSquared': 'sum', 'activeWeight12MSquared': 'sum'})
aggrHoldings.reset_index(inplace = True)
# Remove observations with missing permno
aggrHoldings = aggrHoldings[aggrHoldings.permno.notnull()]
aggrHoldings.replace([np.inf, -np.inf], np.nan, inplace = True)
aggrHoldings.dropna(inplace = True)
#Remove where all three squared aggregated weights = 0. Likely due to missing values
aggrHoldings = aggrHoldings[(aggrHoldings.activeWeight3MSquared!=0)|(aggrHoldings.activeWeight6MSquared!=0)|(aggrHoldings.activeWeight12MSquared!=0)]
#%%
# =============================================================================
# Section 6: Merge aggreHoldings (aggregated weight changes) to ibesUE (earnings surprises),
# ready for regression analysis
# Main outcome variable: aggrHoldings
# Notice that, after this section, the variable aggrHoldings contains:
# - three columns for the aggregated active weight changes relative to 3, 6 and 12 months prior
# - three columns for the square of the aggregated active weight changes relative eto 3, 6 and 12 months prior
# - three columns for the earnings surprises 3, 6, and 12 months ahead
# =============================================================================
aggrHoldings = pd.merge(left = aggrHoldings, right = ibesUE6,\
left_on = ['slided_report_dt', 'permno'],\
right_on = ['estimate3M', 'PERMNO'],\
how = 'left')
aggrHoldings = pd.merge(left = aggrHoldings, right = ibesUE7,\
left_on = ['slided_report_dt', 'permno'],\
right_on = ['estimate6M', 'PERMNO'],\
how = 'left')
aggrHoldings = pd.merge(left = aggrHoldings, right = ibesUE9,\
left_on = ['slided_report_dt', 'permno'],\
right_on = ['estimate12M', 'PERMNO'],\
how = 'left')
aggrHoldings.drop(['PERMNO_x', 'PERMNO_y', 'PERMNO', 'CNAME_x', \
'CNAME_y', 'CNAME', 'ANNDATS_x', 'ANNDATS_y', 'ANNDATS'],\
axis = 1, inplace = True)
aggrHoldings.drop(['estimate6M', 'estimate3M', 'estimate12M'], axis = 1, inplace = True)
# Remove rows where none of the earning surprised can be mapped
aggrHoldings = aggrHoldings[aggrHoldings.UE3M.notnull()|aggrHoldings.UE6M.notnull()|aggrHoldings.UE12M.notnull()]
#%%
# ==========================================================================================================================================================
# ==========================================================================================================================================================
# Section 7: Regression Analysis
# Two actions need to be taken:
# - Comment the chosen model to be run and uncomment the remaining models (see below)
# - Comment [uncomment] the chosen [unchosen] standard error clustering option (see below)
# ==========================================================================================================================================================
# ==========================================================================================================================================================
#Some preparation
from linearmodels.panel import PanelOLS
temp = aggrHoldings.copy(deep = True)
#temp = temp.replace([np.inf, -np.inf], np.nan)
temp = temp.dropna()
temp = temp.set_index(['permno', 'slided_report_dt'])
# =============================================================================
# Specify model
# Explanation:
# The variable mod is the model. To specify the model, the following format is used:
# PanelOLS(dependent variable, independent variables [, entity_effects = False, time_effects = False])
# entity_effects (firm FE) and time_effects (Time FE) are false by default
# The models have been specified below. Please uncomment the model to be run and comment the remaining models
# =============================================================================
####
#Only time effect
####
#mod = PanelOLS(temp.UE3M, temp[['activeWeight3M', 'activeWeight6M', 'activeWeight12M']], time_effects = True)
#mod = PanelOLS(temp.UE3M, temp[['activeWeight3M']], time_effects = True)
#mod = PanelOLS(temp.UE3M, temp[['activeWeight6M']], time_effects = True)
#mod = PanelOLS(temp.UE3M, temp[['activeWeight12M']], time_effects = True)
#mod = PanelOLS(temp.UE6M, temp[['activeWeight3M', 'activeWeight6M', 'activeWeight12M']], time_effects = True)
#mod = PanelOLS(temp.UE6M, temp[['activeWeight3M']], time_effects = True)
#mod = PanelOLS(temp.UE6M, temp[['activeWeight6M']], time_effects = True)
#mod = PanelOLS(temp.UE6M, temp[['activeWeight12M']], time_effects = True)
#mod = PanelOLS(temp.UE12M, temp[['activeWeight3M', 'activeWeight6M', 'activeWeight12M']], time_effects = True)
#mod = PanelOLS(temp.UE12M, temp[['activeWeight3M']], time_effects = True)
#mod = PanelOLS(temp.UE12M, temp[['activeWeight6M']], time_effects = True)
#mod = PanelOLS(temp.UE12M, temp[['activeWeight12M']], time_effects = True)
#mod = PanelOLS(temp.UE3M, temp[['activeWeight3MSquared', 'activeWeight6MSquared', 'activeWeight12MSquared']], time_effects = True)
#mod = PanelOLS(temp.UE3M, temp[['activeWeight3MSquared']], time_effects = True)
#mod = PanelOLS(temp.UE3M, temp[['activeWeight6MSquared']], time_effects = True)
#mod = PanelOLS(temp.UE3M, temp[['activeWeight12MSquared']], time_effects = True)
#mod = PanelOLS(temp.UE6M, temp[['activeWeight3MSquared', 'activeWeight6MSquared', 'activeWeight12MSquared']], time_effects = True)
#mod = PanelOLS(temp.UE6M, temp[['activeWeight3MSquared']], time_effects = True)
#mod = PanelOLS(temp.UE6M, temp[['activeWeight6MSquared']], time_effects = True)
#mod = PanelOLS(temp.UE6M, temp[['activeWeight12MSquared']], time_effects = True)
#mod = PanelOLS(temp.UE12M, temp[['activeWeight3MSquared', 'activeWeight6MSquared', 'activeWeight12MSquared']], time_effects = True)
#mod = PanelOLS(temp.UE12M, temp[['activeWeight3MSquared']], time_effects = True)
#mod = PanelOLS(temp.UE12M, temp[['activeWeight6MSquared']], time_effects = True)
#mod = PanelOLS(temp.UE12M, temp[['activeWeight12MSquared']], time_effects = True)
####
#Only entity effect
####
#mod = PanelOLS(temp.UE3M, temp[['activeWeight3M', 'activeWeight6M', 'activeWeight12M']], entity_effects = True)
#mod = PanelOLS(temp.UE3M, temp[['activeWeight3M']], entity_effects = True)
#mod = PanelOLS(temp.UE3M, temp[['activeWeight6M']], entity_effects = True)
#mod = PanelOLS(temp.UE3M, temp[['activeWeight12M']], entity_effects = True)
#mod = PanelOLS(temp.UE6M, temp[['activeWeight3M', 'activeWeight6M', 'activeWeight12M']], entity_effects = True)
#mod = PanelOLS(temp.UE6M, temp[['activeWeight3M']], entity_effects = True)
#mod = PanelOLS(temp.UE6M, temp[['activeWeight6M']], entity_effects = True)
#mod = PanelOLS(temp.UE6M, temp[['activeWeight12M']], entity_effects = True)
#mod = PanelOLS(temp.UE12M, temp[['activeWeight3M', 'activeWeight6M', 'activeWeight12M']], entity_effects = True)
#mod = PanelOLS(temp.UE12M, temp[['activeWeight3M']], entity_effects = True)
#mod = PanelOLS(temp.UE12M, temp[['activeWeight6M']], entity_effects = True)
#mod = PanelOLS(temp.UE12M, temp[['activeWeight12M']], entity_effects = True)
#mod = PanelOLS(temp.UE3M, temp[['activeWeight3MSquared', 'activeWeight6MSquared', 'activeWeight12MSquared']], entity_effects = True)
#mod = PanelOLS(temp.UE3M, temp[['activeWeight3MSquared']], entity_effects = True)
#mod = PanelOLS(temp.UE3M, temp[['activeWeight6MSquared']], entity_effects = True)
#mod = PanelOLS(temp.UE3M, temp[['activeWeight12MSquared']], entity_effects = True)
#mod = PanelOLS(temp.UE6M, temp[['activeWeight3MSquared', 'activeWeight6MSquared', 'activeWeight12MSquared']], entity_effects = True)
#mod = PanelOLS(temp.UE6M, temp[['activeWeight3MSquared']], entity_effects = True)
#mod = PanelOLS(temp.UE6M, temp[['activeWeight6MSquared']], entity_effects = True)
#mod = PanelOLS(temp.UE6M, temp[['activeWeight12MSquared']], entity_effects = True)
mod = PanelOLS(temp.UE12M, temp[['activeWeight3MSquared', 'activeWeight6MSquared', 'activeWeight12MSquared']], entity_effects = True)
#mod = PanelOLS(temp.UE12M, temp[['activeWeight3MSquared']], entity_effects = True)
#mod = PanelOLS(temp.UE12M, temp[['activeWeight6MSquared']], entity_effects = True)
#mod = PanelOLS(temp.UE12M, temp[['activeWeight12MSquared']], entity_effects = True)
#Both entity and time effect
#mod = PanelOLS(temp.UE3M, temp[['activeWeight3M', 'activeWeight6M', 'activeWeight12M']], entity_effects = True, time_effects = True)
#mod = PanelOLS(temp.UE3M, temp[['activeWeight3M']], entity_effects = True, time_effects = True)
#mod = PanelOLS(temp.UE3M, temp[['activeWeight6M']], entity_effects = True, time_effects = True)
#mod = PanelOLS(temp.UE3M, temp[['activeWeight12M']], entity_effects = True, time_effects = True)
#mod = PanelOLS(temp.UE6M, temp[['activeWeight3M', 'activeWeight6M', 'activeWeight12M']], entity_effects = True, time_effects = True)
#mod = PanelOLS(temp.UE6M, temp[['activeWeight3M']], entity_effects = True, time_effects = True)
#mod = PanelOLS(temp.UE6M, temp[['activeWeight6M']], entity_effects = True, time_effects = True)
#mod = PanelOLS(temp.UE6M, temp[['activeWeight12M']], entity_effects = True, time_effects = True)
#mod = PanelOLS(temp.UE12M, temp[['activeWeight3M', 'activeWeight6M', 'activeWeight12M']], entity_effects = True, time_effects = True)
#mod = PanelOLS(temp.UE12M, temp[['activeWeight3M']], entity_effects = True, time_effects = True)
#mod = PanelOLS(temp.UE12M, temp[['activeWeight6M']], entity_effects = True, time_effects = True)
#mod = PanelOLS(temp.UE12M, temp[['activeWeight12M']], entity_effects = True, time_effects = True)
#mod = PanelOLS(temp.UE3M, temp[['activeWeight3MSquared', 'activeWeight6MSquared', 'activeWeight12MSquared']], entity_effects = True, time_effects = True)
#mod = PanelOLS(temp.UE3M, temp[['activeWeight3MSquared']], entity_effects = True, time_effects = True)
#mod = PanelOLS(temp.UE3M, temp[['activeWeight6MSquared']], entity_effects = True, time_effects = True)
#mod = PanelOLS(temp.UE3M, temp[['activeWeight12MSquared']], entity_effects = True, time_effects = True)
#mod = PanelOLS(temp.UE6M, temp[['activeWeight3MSquared', 'activeWeight6MSquared', 'activeWeight12MSquared']], entity_effects = True, time_effects = True)
#mod = PanelOLS(temp.UE6M, temp[['activeWeight3MSquared']], entity_effects = True, time_effects = True)
#mod = PanelOLS(temp.UE6M, temp[['activeWeight6MSquared']], entity_effects = True, time_effects = True)
#mod = PanelOLS(temp.UE6M, temp[['activeWeight12MSquared']], entity_effects = True, time_effects = True)
#mod = PanelOLS(temp.UE12M, temp[['activeWeight3MSquared', 'activeWeight6MSquared', 'activeWeight12MSquared']], entity_effects = True, time_effects = True)
#mod = PanelOLS(temp.UE12M, temp[['activeWeight3MSquared']], entity_effects = True, time_effects = True)
#mod = PanelOLS(temp.UE12M, temp[['activeWeight6MSquared']], entity_effects = True, time_effects = True)
#mod = PanelOLS(temp.UE12M, temp[['activeWeight12MSquared']], entity_effects = True, time_effects = True)
# =============================================================================
# Specify standard error clustering
# There are two options: cluster standard errors by
# firm (cluster_entity) or by time (cluster_time)
# =============================================================================
res = mod.fit(cov_type = 'clustered', cluster_entity = True)
#res = mod.fit(cov_type = 'clustered', cluster_time = True)
# =============================================================================
# Print the regression result
# =============================================================================
print(res)
del temp