from __future__ import annotations
from typing import List
import pandas as pd
import numpy as np
import pandasmore as pdm
from finsets.wrds import wrds_apicompq_ccm
PROVIDER = 'Wharton Research Data Services (WRDS)'
URL = 'https://wrds-www.wharton.upenn.edu/pages/get-data/center-research-security-prices-crsp/annual-update/crspcompustat-merged/fundamentals-quarterly/'
LIBRARY = 'comp'
TABLE = 'fundq'
LINK_LIBRARY = 'crsp'
LINK_TABLE = 'ccmxpf_lnkhist'
COMPANY_TABLE = 'company' #contains some header information that is missing from comp.funda (e.g. sic and naics)
FREQ = 'Q'
MIN_YEAR = 1961
MAX_YEAR = None
ENTITY_ID_IN_RAW_DSET = 'permno'
ENTITY_ID_IN_CLEAN_DSET = 'permno'
TIME_VAR_IN_RAW_DSET = 'datadate'
TIME_VAR_IN_CLEAN_DSET = f'{FREQ}date'list_all_vars
list_all_vars ()
Collects names of all available variables from WRDS f{LIBRARY}.{TABLE} and {LIBRARY}.{COMPANY_TABLE}.
all_vars = list_all_vars()all_vars.query("name.str.contains('naics')")| name | type | wrds_library | wrds_table | |
|---|---|---|---|---|
| 25 | naics | VARCHAR(6) | comp | company |
default_raw_vars
default_raw_vars ()
Defines default variables used in get_raw_data if none are specified.
print(default_raw_vars())['datadate', 'gvkey', 'cik', 'cusip', 'fyearq', 'fqtr', 'fyr', 'sic', 'naics', 'exchg', 'rdq', 'fic', 'atq', 'req', 'xrdq', 'cheq', 'saleq', 'revtq', 'dpq', 'ibq', 'cshoq', 'ceqq', 'seqq', 'txdiq', 'ltq', 'txditcq', 'pstkq', 'pstkrq', 'lctq', 'actq', 'piq', 'niq', 'cshprq', 'epsfxq', 'opepsq', 'epsfiq', 'epspiq', 'epspxq', 'dlttq', 'dlcq', 'txtq', 'xintq', 'ppegtq', 'ppentq', 'rectq', 'invtq', 'cogsq', 'xsgaq', 'ajexq', 'prccq', 'capxy', 'oancfy', 'sstky', 'prstkcy', 'dltisy', 'dltry', 'dvpq', 'dvy', 'sppey', 'aqcy', 'fopty', 'scstkcy', 'wcapq', 'oibdpq', 'tstkq', 'apdedateq', 'fdateq', 'cdvcy', 'cheq', 'intanq', 'gdwlq', 'mibq', 'oiadpq', 'ivaoq', 'npq', 'rectrq']
parse_varlist
parse_varlist (vars:List[str]=None, required_vars=[])
Figures out which vars come from the {LIBRARY}.{TABLE} table and which come from the {LIBRARY}.{COMPANY_TABLE} table and adds a. and b. prefixes to variable names to feed into an SQL query
parse_varlist(['atq','ltq','sic'])'a.ltq,a.atq,b.sic'
get_raw_data
get_raw_data (vars:List[str]=None, required_vars:List[str]=['gvkey', 'datadate', 'fyearq', 'fqtr', 'rdq'], nrows:int=None, start_date:str=None, end_date:str=None)
Downloads vars from start_date to end_date from WRDS {LIBRARY}.{TABLE} and {LIBRARY}.{COMPANY_TABLE}. It also adds sich and naicsh from the annual table (comp.funda)
| Type | Default | Details | |
|---|---|---|---|
| vars | List[str] | None | If None, downloads default_raw_vars; use ’*’ to get all available variables |
| required_vars | List[str] | [‘gvkey’, ‘datadate’, ‘fyearq’, ‘fqtr’, ‘rdq’] | list of variables that will get downloaded, even if not in vars |
| nrows | int | None | Number of rows to download. If None, full dataset will be downloaded |
| start_date | str | None | Start date in MM/DD/YYYY format |
| end_date | str | None | End date in MM/DD/YYYY format |
| Returns | pd.DataFrame |
The get_raw_data function will produce unique gvkey-datadate records, which is not the case for the data we would get from the WRDS website, because they use indfmt IN ('INDL','FS') (while we use indfmt='INDL').
m = get_raw_data(['atq','indfmt','sic'],nrows=1)
m| gvkey | datadate | fyearq | fqtr | rdq | indfmt | atq | sic | sich | naicsh | permno | permco | liid | linkprim | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 001000 | 1970-12-31 | 1970.0 | 4.0 | None | INDL | None | 3089 | None | None | 25881.0 | 23369.0 | 01 | P |
r = get_raw_data(vars='*', nrows=1)
r| gvkey | datadate | fyearq | fqtr | fyr | indfmt | consol | popsrc | datafmt | tic | ... | stko | weburl | dldte | ipodate | sich | naicsh | permno | permco | liid | linkprim | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 001000 | 1970-12-31 | 1970.0 | 4.0 | 12.0 | INDL | C | D | STD | AE.2 | ... | 0.0 | None | 1978-06-30 | None | None | None | 25881.0 | 23369.0 | 01 | P |
1 rows × 687 columns
raw = get_raw_data(start_date='01/01/2021', end_date='01/01/2023')raw.head(1)| gvkey | datadate | fyearq | fqtr | rdq | exchg | intanq | txditcq | oibdpq | apdedateq | ... | cik | wcapq | txtq | cheq | sich | naicsh | permno | permco | liid | linkprim | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 040237 | 2022-03-31 | 2022.0 | 1.0 | None | 14.0 | NaN | NaN | NaN | None | ... | 0001838716 | NaN | NaN | NaN | 2836.0 | 325414.0 | 22557.0 | 59079.0 | 90 | P |
1 rows × 81 columns
process_raw_data
process_raw_data (df:pandas.core.frame.DataFrame=None, clean_kwargs:dict={})
Drops duplicates, cleans up dates and applies pandasmore.setup_panel to df
| Type | Default | Details | |
|---|---|---|---|
| df | pd.DataFrame | None | Must contain permno and datadate columns |
| clean_kwargs | dict | {} | Params to pass to pdm.setup_panel other than panel_ids, time_var, and freq |
| Returns | pd.DataFrame |
df_clean = process_raw_data(raw)print(df_clean.shape)
df_clean.head(1)(46432, 83)
| datadate | dtdate | dtdate_fiscal | Qdate_fiscal | fyearq | fqtr | rdq | gvkey | exchg | intanq | ... | sppey | cik | wcapq | txtq | cheq | sich | naicsh | permco | liid | linkprim | ||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| permno | Qdate | |||||||||||||||||||||
| 10026 | 2021Q1 | 2021-03-31 | 2021-03-31 | 2021-06-30 | 2021Q2 | 2021.0 | 2.0 | 2021-04-26 | 012825 | 14.0 | 202.138 | ... | 1.262 | 0000785956 | 387.072 | 1.752 | 259.765 | 2050 | 311812 | 7976 | 01 | P |
1 rows × 83 columns
ytd_to_quarterly
ytd_to_quarterly (df:pandas.core.frame.DataFrame=None, vars:List[str]=['capxy', 'oancfy', 'sstky', 'prstkcy', 'dltisy', 'dltry', 'dvy', 'sppey', 'aqcy', 'fopty', 'scstkcy'], suffix:str='_q')
Convert YTD variables to quarterly variables by taking the difference between the current and previous quarter.
| Type | Default | Details | |
|---|---|---|---|
| df | pd.DataFrame | None | |
| vars | List[str] | [‘capxy’, ‘oancfy’, ‘sstky’, ‘prstkcy’, ‘dltisy’, ‘dltry’, ‘dvy’, ‘sppey’, ‘aqcy’, ‘fopty’, ‘scstkcy’] | |
| suffix | str | _q | Suffix to add to the new quarterly variables |
| Returns | pd.DataFrame |
q = ytd_to_quarterly(df_clean)
q.head(0)| capxy_q | oancfy_q | sstky_q | prstkcy_q | dltisy_q | dltry_q | dvy_q | sppey_q | aqcy_q | fopty_q | scstkcy_q | ||
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| permno | Qdate |
features
features (df:pandas.core.frame.DataFrame=None)
Computes a set of features from df
ftrs = features(df_clean)ftrs.head(1)| capxy_q | oancfy_q | sstky_q | prstkcy_q | dltisy_q | dltry_q | dvy_q | sppey_q | aqcy_q | fopty_q | ... | cflow_is | cflow_cfs | cflow_full | cflow_is_2la | cflow_cfs_2la | cflow_full_2la | cash_2a | booklev | dividends_2la | repurchases_2la | ||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| permno | Qdate | |||||||||||||||||||||
| 10026 | 2021Q1 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | 18.823 | NaN | NaN | NaN | NaN | NaN | 0.24393 | 0.053648 | NaN | NaN |
1 rows × 44 columns