from __future__ import annotations
from typing import List
import pandas as pd
import numpy as np
import pandasmore as pdm
from finsets.wrds import wrds_api
compq
= 'Wharton Research Data Services (WRDS)'
PROVIDER = 'https://wrds-www.wharton.upenn.edu/pages/get-data/compustat-capital-iq-standard-poors/compustat/north-america-daily/fundamentals-quarterly/'
URL = 'comp'
LIBRARY = 'fundq'
TABLE = 'company' #contains some header information that is missing from comp.funda (e.g. sic and naics)
COMPANY_TABLE = 'Q'
FREQ = 1961
MIN_YEAR = None
MAX_YEAR = 'gvkey'
ENTITY_ID_IN_RAW_DSET = 'gvkey'
ENTITY_ID_IN_CLEAN_DSET = 'datadate'
TIME_VAR_IN_RAW_DSET = f'{FREQ}date' TIME_VAR_IN_CLEAN_DSET
list_all_vars
list_all_vars ()
Collects names of all available variables from WRDS f{LIBRARY}.{TABLE}
and {LIBRARY}.{COMPANY_TABLE}
.
= list_all_vars() all_vars
"name.str.contains('naics')") all_vars.query(
name | type | wrds_library | wrds_table | |
---|---|---|---|---|
25 | naics | VARCHAR(6) | comp | company |
default_raw_vars
default_raw_vars ()
Defines default variables used in get_raw_data
if none are specified.
print(default_raw_vars())
['datadate', 'gvkey', 'cik', 'cusip', 'fyearq', 'fqtr', 'fyr', 'sic', 'naics', 'exchg', 'rdq', 'fic', 'atq', 'req', 'xrdq', 'cheq', 'saleq', 'revtq', 'dpq', 'ibq', 'cshoq', 'ceqq', 'seqq', 'txdiq', 'ltq', 'txditcq', 'pstkq', 'pstkrq', 'lctq', 'actq', 'piq', 'niq', 'cshprq', 'epsfxq', 'opepsq', 'epsfiq', 'epspiq', 'epspxq', 'dlttq', 'dlcq', 'txtq', 'xintq', 'ppegtq', 'ppentq', 'rectq', 'invtq', 'cogsq', 'xsgaq', 'ajexq', 'prccq', 'capxy', 'oancfy', 'sstky', 'prstkcy', 'dltisy', 'dltry', 'dvpq', 'dvy', 'sppey', 'aqcy', 'fopty', 'scstkcy', 'wcapq', 'oibdpq', 'tstkq', 'apdedateq', 'fdateq', 'cdvcy', 'cheq', 'intanq', 'gdwlq', 'mibq', 'oiadpq', 'ivaoq', 'npq', 'rectrq']
parse_varlist
parse_varlist (vars:List[str]=None, required_vars=[])
Figures out which vars
come from the {LIBRARY}.{TABLE}
table and which come from the {LIBRARY}.{COMPANY_TABLE}
table and adds a. and b. prefixes to variable names to feed into an SQL query
'atq','ltq','sic']) parse_varlist([
'b.sic,a.ltq,a.atq'
get_raw_data
get_raw_data (vars:List[str]=None, required_vars:List[str]=['gvkey', 'datadate', 'fyearq', 'fqtr', 'rdq'], nrows:int=None, start_date:str=None, end_date:str=None)
Downloads vars
from start_date
to end_date
from WRDS {LIBRARY}.{TABLE}
and {LIBRARY}.{COMPANY_TABLE}
. It also adds sich
and naicsh
from the annual table (comp.funda)
Type | Default | Details | |
---|---|---|---|
vars | List[str] | None | If None, downloads default_raw_vars ; use ’*’ to get all available variables |
required_vars | List[str] | [‘gvkey’, ‘datadate’, ‘fyearq’, ‘fqtr’, ‘rdq’] | list of variables that will get downloaded, even if not in vars |
nrows | int | None | Number of rows to download. If None, full dataset will be downloaded |
start_date | str | None | Start date in MM/DD/YYYY format |
end_date | str | None | End date in MM/DD/YYYY format |
Returns | pd.DataFrame |
The get_raw_data
function will produce unique gvkey-datadate
records, which is not the case for the data we would get from the WRDS website, because they use indfmt IN ('INDL','FS')
(while we use indfmt='INDL'
).
= get_raw_data(['atq','indfmt','sic'],nrows=1)
m m
gvkey | datadate | fyearq | fqtr | rdq | indfmt | sic | atq | sich | naicsh | |
---|---|---|---|---|---|---|---|---|---|---|
0 | 001000 | 1966-03-31 | 1966.0 | 1.0 | None | INDL | 3089 | None | None | None |
= get_raw_data(vars='*', nrows=1000)
r 1) r.head(
gvkey | datadate | fyearq | fqtr | fyr | indfmt | consol | popsrc | datafmt | tic | ... | spcindcd | spcseccd | spcsrc | state | stko | weburl | dldte | ipodate | sich | naicsh | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 001345 | 1962-04-30 | 1961.0 | 4.0 | 4.0 | INDL | C | D | STD | 3ALRM | ... | 440.0 | 978.0 | None | GA | 0.0 | None | 1980-06-30 | None | None | NaN |
1 rows × 683 columns
= get_raw_data(start_date='01/01/2021', end_date='01/01/2023') raw
0) raw.head(
gvkey | datadate | fyearq | fqtr | rdq | naics | sstky | seqq | actq | saleq | ... | epsfxq | fdateq | cdvcy | wcapq | oiadpq | txdiq | gdwlq | req | sich | naicsh |
---|
0 rows × 77 columns
process_raw_data
process_raw_data (df:pandas.core.frame.DataFrame=None, clean_kwargs:dict={})
Drops duplicage, cleans up dates and applies pandasmore.setup_panel
to df
Type | Default | Details | |
---|---|---|---|
df | pd.DataFrame | None | Must contain permno and datadate columns |
clean_kwargs | dict | {} | Params to pass to pdm.setup_panel other than panel_ids , time_var , and freq |
Returns | pd.DataFrame |
= process_raw_data(raw) df_clean
print(df_clean.shape)
1) df_clean.head(
(95932, 79)
datadate | dtdate | dtdate_fiscal | Qdate_fiscal | fyearq | fqtr | rdq | naics | sstky | seqq | ... | epsfxq | fdateq | cdvcy | wcapq | oiadpq | txdiq | gdwlq | req | sich | naicsh | ||
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
gvkey | Qdate | |||||||||||||||||||||
001004 | 2021Q1 | 2021-02-28 | 2021-02-28 | 2020-09-30 | 2020Q3 | 2020.0 | 3.0 | 2021-03-23 | 423860 | NaN | 932.4 | ... | 0.87 | 2021-04-05 | NaN | 640.7 | 19.5 | NaN | NaN | 685.3 | 5080 | 423860 |
1 rows × 79 columns
ytd_to_quarterly
ytd_to_quarterly (df:pandas.core.frame.DataFrame=None, vars:List[str]=['capxy', 'oancfy', 'sstky', 'prstkcy', 'dltisy', 'dltry', 'dvy', 'sppey', 'aqcy', 'fopty', 'scstkcy'], suffix:str='_q')
Convert YTD variables to quarterly variables by taking the difference between the current and previous quarter.
Type | Default | Details | |
---|---|---|---|
df | pd.DataFrame | None | |
vars | List[str] | [‘capxy’, ‘oancfy’, ‘sstky’, ‘prstkcy’, ‘dltisy’, ‘dltry’, ‘dvy’, ‘sppey’, ‘aqcy’, ‘fopty’, ‘scstkcy’] | |
suffix | str | _q | Suffix to add to the new quarterly variables |
Returns | pd.DataFrame |
= ytd_to_quarterly(df_clean)
q 0) q.head(
capxy_q | oancfy_q | sstky_q | prstkcy_q | dltisy_q | dltry_q | dvy_q | sppey_q | aqcy_q | fopty_q | scstkcy_q | ||
---|---|---|---|---|---|---|---|---|---|---|---|---|
gvkey | Qdate |
features
features (df:pandas.core.frame.DataFrame=None)
Computes a set of features from df
= features(df_clean) ftrs
1) ftrs.head(
capxy_q | oancfy_q | sstky_q | prstkcy_q | dltisy_q | dltry_q | dvy_q | sppey_q | aqcy_q | fopty_q | ... | cflow_is | cflow_cfs | cflow_full | cflow_is_2la | cflow_cfs_2la | cflow_full_2la | cash_2a | booklev | dividends_2la | repurchases_2la | ||
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
gvkey | Qdate | |||||||||||||||||||||
001004 | 2021Q1 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | 40.0 | NaN | NaN | NaN | NaN | NaN | 0.068542 | NaN | NaN | NaN |
1 rows × 44 columns