from __future__ import annotations
from typing import List
import pandas as pd
import numpy as np
import pandasmore as pdm
from finsets.wrds import wrds_api
compq_ccm
= 'Wharton Research Data Services (WRDS)'
PROVIDER = 'https://wrds-www.wharton.upenn.edu/pages/get-data/center-research-security-prices-crsp/annual-update/crspcompustat-merged/fundamentals-quarterly/'
URL = 'comp'
LIBRARY = 'fundq'
TABLE = 'crsp'
LINK_LIBRARY = 'ccmxpf_lnkhist'
LINK_TABLE = 'company' #contains some header information that is missing from comp.funda (e.g. sic and naics)
COMPANY_TABLE = 'Q'
FREQ = 1961
MIN_YEAR = None
MAX_YEAR = 'permno'
ENTITY_ID_IN_RAW_DSET = 'permno'
ENTITY_ID_IN_CLEAN_DSET = 'datadate'
TIME_VAR_IN_RAW_DSET = f'{FREQ}date' TIME_VAR_IN_CLEAN_DSET
list_all_vars
list_all_vars ()
Collects names of all available variables from WRDS f{LIBRARY}.{TABLE}
and {LIBRARY}.{COMPANY_TABLE}
.
= list_all_vars() all_vars
"name.str.contains('naics')") all_vars.query(
name | type | wrds_library | wrds_table | |
---|---|---|---|---|
25 | naics | VARCHAR(6) | comp | company |
default_raw_vars
default_raw_vars ()
Defines default variables used in get_raw_data
if none are specified.
print(default_raw_vars())
['datadate', 'gvkey', 'cik', 'cusip', 'fyearq', 'fqtr', 'fyr', 'sic', 'naics', 'exchg', 'rdq', 'fic', 'atq', 'req', 'xrdq', 'cheq', 'saleq', 'revtq', 'dpq', 'ibq', 'cshoq', 'ceqq', 'seqq', 'txdiq', 'ltq', 'txditcq', 'pstkq', 'pstkrq', 'lctq', 'actq', 'piq', 'niq', 'cshprq', 'epsfxq', 'opepsq', 'epsfiq', 'epspiq', 'epspxq', 'dlttq', 'dlcq', 'txtq', 'xintq', 'ppegtq', 'ppentq', 'rectq', 'invtq', 'cogsq', 'xsgaq', 'ajexq', 'prccq', 'capxy', 'oancfy', 'sstky', 'prstkcy', 'dltisy', 'dltry', 'dvpq', 'dvy', 'sppey', 'aqcy', 'fopty', 'scstkcy', 'wcapq', 'oibdpq', 'tstkq', 'apdedateq', 'fdateq', 'cdvcy', 'cheq', 'intanq', 'gdwlq', 'mibq', 'oiadpq', 'ivaoq', 'npq', 'rectrq']
parse_varlist
parse_varlist (vars:List[str]=None, required_vars=[])
Figures out which vars
come from the {LIBRARY}.{TABLE}
table and which come from the {LIBRARY}.{COMPANY_TABLE}
table and adds a. and b. prefixes to variable names to feed into an SQL query
'atq','ltq','sic']) parse_varlist([
'a.ltq,a.atq,b.sic'
get_raw_data
get_raw_data (vars:List[str]=None, required_vars:List[str]=['gvkey', 'datadate', 'fyearq', 'fqtr', 'rdq'], nrows:int=None, start_date:str=None, end_date:str=None)
Downloads vars
from start_date
to end_date
from WRDS {LIBRARY}.{TABLE}
and {LIBRARY}.{COMPANY_TABLE}
. It also adds sich
and naicsh
from the annual table (comp.funda)
Type | Default | Details | |
---|---|---|---|
vars | List[str] | None | If None, downloads default_raw_vars ; use ’*’ to get all available variables |
required_vars | List[str] | [‘gvkey’, ‘datadate’, ‘fyearq’, ‘fqtr’, ‘rdq’] | list of variables that will get downloaded, even if not in vars |
nrows | int | None | Number of rows to download. If None, full dataset will be downloaded |
start_date | str | None | Start date in MM/DD/YYYY format |
end_date | str | None | End date in MM/DD/YYYY format |
Returns | pd.DataFrame |
The get_raw_data
function will produce unique gvkey-datadate
records, which is not the case for the data we would get from the WRDS website, because they use indfmt IN ('INDL','FS')
(while we use indfmt='INDL'
).
= get_raw_data(['atq','indfmt','sic'],nrows=1)
m m
gvkey | datadate | fyearq | fqtr | rdq | indfmt | atq | sic | sich | naicsh | permno | permco | liid | linkprim | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 001000 | 1970-12-31 | 1970.0 | 4.0 | None | INDL | None | 3089 | None | None | 25881.0 | 23369.0 | 01 | P |
= get_raw_data(vars='*', nrows=1)
r r
gvkey | datadate | fyearq | fqtr | fyr | indfmt | consol | popsrc | datafmt | tic | ... | stko | weburl | dldte | ipodate | sich | naicsh | permno | permco | liid | linkprim | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 001000 | 1970-12-31 | 1970.0 | 4.0 | 12.0 | INDL | C | D | STD | AE.2 | ... | 0.0 | None | 1978-06-30 | None | None | None | 25881.0 | 23369.0 | 01 | P |
1 rows × 687 columns
= get_raw_data(start_date='01/01/2021', end_date='01/01/2023') raw
1) raw.head(
gvkey | datadate | fyearq | fqtr | rdq | exchg | intanq | txditcq | oibdpq | apdedateq | ... | cik | wcapq | txtq | cheq | sich | naicsh | permno | permco | liid | linkprim | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 040237 | 2022-03-31 | 2022.0 | 1.0 | None | 14.0 | NaN | NaN | NaN | None | ... | 0001838716 | NaN | NaN | NaN | 2836.0 | 325414.0 | 22557.0 | 59079.0 | 90 | P |
1 rows × 81 columns
process_raw_data
process_raw_data (df:pandas.core.frame.DataFrame=None, clean_kwargs:dict={})
Drops duplicates, cleans up dates and applies pandasmore.setup_panel
to df
Type | Default | Details | |
---|---|---|---|
df | pd.DataFrame | None | Must contain permno and datadate columns |
clean_kwargs | dict | {} | Params to pass to pdm.setup_panel other than panel_ids , time_var , and freq |
Returns | pd.DataFrame |
= process_raw_data(raw) df_clean
print(df_clean.shape)
1) df_clean.head(
(46432, 83)
datadate | dtdate | dtdate_fiscal | Qdate_fiscal | fyearq | fqtr | rdq | gvkey | exchg | intanq | ... | sppey | cik | wcapq | txtq | cheq | sich | naicsh | permco | liid | linkprim | ||
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
permno | Qdate | |||||||||||||||||||||
10026 | 2021Q1 | 2021-03-31 | 2021-03-31 | 2021-06-30 | 2021Q2 | 2021.0 | 2.0 | 2021-04-26 | 012825 | 14.0 | 202.138 | ... | 1.262 | 0000785956 | 387.072 | 1.752 | 259.765 | 2050 | 311812 | 7976 | 01 | P |
1 rows × 83 columns
ytd_to_quarterly
ytd_to_quarterly (df:pandas.core.frame.DataFrame=None, vars:List[str]=['capxy', 'oancfy', 'sstky', 'prstkcy', 'dltisy', 'dltry', 'dvy', 'sppey', 'aqcy', 'fopty', 'scstkcy'], suffix:str='_q')
Convert YTD variables to quarterly variables by taking the difference between the current and previous quarter.
Type | Default | Details | |
---|---|---|---|
df | pd.DataFrame | None | |
vars | List[str] | [‘capxy’, ‘oancfy’, ‘sstky’, ‘prstkcy’, ‘dltisy’, ‘dltry’, ‘dvy’, ‘sppey’, ‘aqcy’, ‘fopty’, ‘scstkcy’] | |
suffix | str | _q | Suffix to add to the new quarterly variables |
Returns | pd.DataFrame |
= ytd_to_quarterly(df_clean)
q 0) q.head(
capxy_q | oancfy_q | sstky_q | prstkcy_q | dltisy_q | dltry_q | dvy_q | sppey_q | aqcy_q | fopty_q | scstkcy_q | ||
---|---|---|---|---|---|---|---|---|---|---|---|---|
permno | Qdate |
features
features (df:pandas.core.frame.DataFrame=None)
Computes a set of features from df
= features(df_clean) ftrs
1) ftrs.head(
capxy_q | oancfy_q | sstky_q | prstkcy_q | dltisy_q | dltry_q | dvy_q | sppey_q | aqcy_q | fopty_q | ... | cflow_is | cflow_cfs | cflow_full | cflow_is_2la | cflow_cfs_2la | cflow_full_2la | cash_2a | booklev | dividends_2la | repurchases_2la | ||
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
permno | Qdate | |||||||||||||||||||||
10026 | 2021Q1 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | 18.823 | NaN | NaN | NaN | NaN | NaN | 0.24393 | 0.053648 | NaN | NaN |
1 rows × 44 columns