from __future__ import annotations
from typing import List
import pandas as pd
import numpy as np
import pandasmore as pdm
from finsets.wrds import wrds_apicompq
PROVIDER = 'Wharton Research Data Services (WRDS)'
URL = 'https://wrds-www.wharton.upenn.edu/pages/get-data/compustat-capital-iq-standard-poors/compustat/north-america-daily/fundamentals-quarterly/'
LIBRARY = 'comp'
TABLE = 'fundq'
COMPANY_TABLE = 'company' #contains some header information that is missing from comp.funda (e.g. sic and naics)
FREQ = 'Q'
MIN_YEAR = 1961
MAX_YEAR = None
ENTITY_ID_IN_RAW_DSET = 'gvkey'
ENTITY_ID_IN_CLEAN_DSET = 'gvkey'
TIME_VAR_IN_RAW_DSET = 'datadate'
TIME_VAR_IN_CLEAN_DSET = f'{FREQ}date'list_all_vars
list_all_vars ()
Collects names of all available variables from WRDS f{LIBRARY}.{TABLE} and {LIBRARY}.{COMPANY_TABLE}.
all_vars = list_all_vars()all_vars.query("name.str.contains('naics')")| name | type | wrds_library | wrds_table | |
|---|---|---|---|---|
| 25 | naics | VARCHAR(6) | comp | company |
default_raw_vars
default_raw_vars ()
Defines default variables used in get_raw_data if none are specified.
print(default_raw_vars())['datadate', 'gvkey', 'cik', 'cusip', 'fyearq', 'fqtr', 'fyr', 'sic', 'naics', 'exchg', 'rdq', 'fic', 'atq', 'req', 'xrdq', 'cheq', 'saleq', 'revtq', 'dpq', 'ibq', 'cshoq', 'ceqq', 'seqq', 'txdiq', 'ltq', 'txditcq', 'pstkq', 'pstkrq', 'lctq', 'actq', 'piq', 'niq', 'cshprq', 'epsfxq', 'opepsq', 'epsfiq', 'epspiq', 'epspxq', 'dlttq', 'dlcq', 'txtq', 'xintq', 'ppegtq', 'ppentq', 'rectq', 'invtq', 'cogsq', 'xsgaq', 'ajexq', 'prccq', 'capxy', 'oancfy', 'sstky', 'prstkcy', 'dltisy', 'dltry', 'dvpq', 'dvy', 'sppey', 'aqcy', 'fopty', 'scstkcy', 'wcapq', 'oibdpq', 'tstkq', 'apdedateq', 'fdateq', 'cdvcy', 'cheq', 'intanq', 'gdwlq', 'mibq', 'oiadpq', 'ivaoq', 'npq', 'rectrq']
parse_varlist
parse_varlist (vars:List[str]=None, required_vars=[])
Figures out which vars come from the {LIBRARY}.{TABLE} table and which come from the {LIBRARY}.{COMPANY_TABLE} table and adds a. and b. prefixes to variable names to feed into an SQL query
parse_varlist(['atq','ltq','sic'])'b.sic,a.ltq,a.atq'
get_raw_data
get_raw_data (vars:List[str]=None, required_vars:List[str]=['gvkey', 'datadate', 'fyearq', 'fqtr', 'rdq'], nrows:int=None, start_date:str=None, end_date:str=None)
Downloads vars from start_date to end_date from WRDS {LIBRARY}.{TABLE} and {LIBRARY}.{COMPANY_TABLE}. It also adds sich and naicsh from the annual table (comp.funda)
| Type | Default | Details | |
|---|---|---|---|
| vars | List[str] | None | If None, downloads default_raw_vars; use ’*’ to get all available variables |
| required_vars | List[str] | [‘gvkey’, ‘datadate’, ‘fyearq’, ‘fqtr’, ‘rdq’] | list of variables that will get downloaded, even if not in vars |
| nrows | int | None | Number of rows to download. If None, full dataset will be downloaded |
| start_date | str | None | Start date in MM/DD/YYYY format |
| end_date | str | None | End date in MM/DD/YYYY format |
| Returns | pd.DataFrame |
The get_raw_data function will produce unique gvkey-datadate records, which is not the case for the data we would get from the WRDS website, because they use indfmt IN ('INDL','FS') (while we use indfmt='INDL').
m = get_raw_data(['atq','indfmt','sic'],nrows=1)
m| gvkey | datadate | fyearq | fqtr | rdq | indfmt | sic | atq | sich | naicsh | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 001000 | 1966-03-31 | 1966.0 | 1.0 | None | INDL | 3089 | None | None | None |
r = get_raw_data(vars='*', nrows=1000)
r.head(1)| gvkey | datadate | fyearq | fqtr | fyr | indfmt | consol | popsrc | datafmt | tic | ... | spcindcd | spcseccd | spcsrc | state | stko | weburl | dldte | ipodate | sich | naicsh | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 001345 | 1962-04-30 | 1961.0 | 4.0 | 4.0 | INDL | C | D | STD | 3ALRM | ... | 440.0 | 978.0 | None | GA | 0.0 | None | 1980-06-30 | None | None | NaN |
1 rows × 683 columns
raw = get_raw_data(start_date='01/01/2021', end_date='01/01/2023')raw.head(0)| gvkey | datadate | fyearq | fqtr | rdq | naics | sstky | seqq | actq | saleq | ... | epsfxq | fdateq | cdvcy | wcapq | oiadpq | txdiq | gdwlq | req | sich | naicsh |
|---|
0 rows × 77 columns
process_raw_data
process_raw_data (df:pandas.core.frame.DataFrame=None, clean_kwargs:dict={})
Drops duplicage, cleans up dates and applies pandasmore.setup_panel to df
| Type | Default | Details | |
|---|---|---|---|
| df | pd.DataFrame | None | Must contain permno and datadate columns |
| clean_kwargs | dict | {} | Params to pass to pdm.setup_panel other than panel_ids, time_var, and freq |
| Returns | pd.DataFrame |
df_clean = process_raw_data(raw)print(df_clean.shape)
df_clean.head(1)(95932, 79)
| datadate | dtdate | dtdate_fiscal | Qdate_fiscal | fyearq | fqtr | rdq | naics | sstky | seqq | ... | epsfxq | fdateq | cdvcy | wcapq | oiadpq | txdiq | gdwlq | req | sich | naicsh | ||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| gvkey | Qdate | |||||||||||||||||||||
| 001004 | 2021Q1 | 2021-02-28 | 2021-02-28 | 2020-09-30 | 2020Q3 | 2020.0 | 3.0 | 2021-03-23 | 423860 | NaN | 932.4 | ... | 0.87 | 2021-04-05 | NaN | 640.7 | 19.5 | NaN | NaN | 685.3 | 5080 | 423860 |
1 rows × 79 columns
ytd_to_quarterly
ytd_to_quarterly (df:pandas.core.frame.DataFrame=None, vars:List[str]=['capxy', 'oancfy', 'sstky', 'prstkcy', 'dltisy', 'dltry', 'dvy', 'sppey', 'aqcy', 'fopty', 'scstkcy'], suffix:str='_q')
Convert YTD variables to quarterly variables by taking the difference between the current and previous quarter.
| Type | Default | Details | |
|---|---|---|---|
| df | pd.DataFrame | None | |
| vars | List[str] | [‘capxy’, ‘oancfy’, ‘sstky’, ‘prstkcy’, ‘dltisy’, ‘dltry’, ‘dvy’, ‘sppey’, ‘aqcy’, ‘fopty’, ‘scstkcy’] | |
| suffix | str | _q | Suffix to add to the new quarterly variables |
| Returns | pd.DataFrame |
q = ytd_to_quarterly(df_clean)
q.head(0)| capxy_q | oancfy_q | sstky_q | prstkcy_q | dltisy_q | dltry_q | dvy_q | sppey_q | aqcy_q | fopty_q | scstkcy_q | ||
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| gvkey | Qdate |
features
features (df:pandas.core.frame.DataFrame=None)
Computes a set of features from df
ftrs = features(df_clean)ftrs.head(1)| capxy_q | oancfy_q | sstky_q | prstkcy_q | dltisy_q | dltry_q | dvy_q | sppey_q | aqcy_q | fopty_q | ... | cflow_is | cflow_cfs | cflow_full | cflow_is_2la | cflow_cfs_2la | cflow_full_2la | cash_2a | booklev | dividends_2la | repurchases_2la | ||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| gvkey | Qdate | |||||||||||||||||||||
| 001004 | 2021Q1 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | 40.0 | NaN | NaN | NaN | NaN | NaN | 0.068542 | NaN | NaN | NaN |
1 rows × 44 columns