from __future__ import annotations
from typing import List
import pandas as pd
import numpy as np
import pandasmore as pdm
from finsets.wrds import wrds_api
crspm
Since this is a proprietary dataset, in the documentation below, I can not show any of the data that is being retrieved/generated (will show only column names).
= 'Wharton Research Data Services (WRDS)'
PROVIDER = 'https://wrds-www.wharton.upenn.edu/pages/get-data/center-research-security-prices-crsp/annual-update/stock-security-files/monthly-stock-file/'
URL = 'crsp'
LIBRARY = 'msf'
TABLE = 'msenames'
NAMES_TABLE = 'msedelist'
DELIST_TABLE = 'M'
FREQ = 1925
MIN_YEAR = None
MAX_YEAR = 'permno'
ENTITY_ID_IN_RAW_DSET = 'permno'
ENTITY_ID_IN_CLEAN_DSET = 'date'
TIME_VAR_IN_RAW_DSET = f'{FREQ}date' TIME_VAR_IN_CLEAN_DSET
list_all_vars
list_all_vars ()
Collects names of all available variables from WRDS {LIBRARY}.{TABLE}
and {LIBRARY}.{NAMES_TABLE}
= list_all_vars() all_vars
all_vars.name.count()
61
all_vars.head()
name | type | wrds_library | wrds_table | |
---|---|---|---|---|
0 | cusip | VARCHAR(8) | crsp | msf |
1 | permno | DOUBLE_PRECISION | crsp | msf |
2 | permco | DOUBLE_PRECISION | crsp | msf |
3 | issuno | DOUBLE_PRECISION | crsp | msf |
4 | hexcd | DOUBLE_PRECISION | crsp | msf |
default_raw_vars
default_raw_vars ()
Defines default variables used in get_raw_data
if none are specified.
print(default_raw_vars())
['permno', 'permco', 'date', 'ret', 'retx', 'shrout', 'prc', 'shrcd', 'exchcd', 'siccd', 'naics', 'ticker', 'cusip', 'ncusip', 'dlret', 'dlstcd', 'dlstdt']
parse_varlist
parse_varlist (vars:List[str]=None, required_vars=[])
Figures out which vars
come from the {LIBRARY}.{TABLE}
table and which come from the {LIBRARY}.{NAMES_TABLE}
table and adds a. and b. prefixes to variable names to feed into an SQL query
'ret', 'siccd', 'naics', 'dlret']) parse_varlist([
'c.dlret,b.naics,b.siccd,a.ret'
'*') parse_varlist(
'a.cusip,a.permno,a.permco,a.issuno,a.hexcd,a.hsiccd,a.date,a.bidlo,a.askhi,a.prc,a.vol,a.ret,a.bid,a.ask,a.shrout,a.cfacpr,a.cfacshr,a.altprc,a.spread,a.altprcdt,a.retx,b.namedt,b.nameendt,b.shrcd,b.exchcd,b.siccd,b.ncusip,b.ticker,b.comnam,b.shrcls,b.tsymbol,b.naics,b.primexch,b.trdstat,b.secstat,b.compno,c.dlstdt,c.dlstcd,c.nwperm,c.nwcomp,c.nextdt,c.dlamt,c.dlretx,c.dlprc,c.dlpdt,c.dlret,c.acperm,c.accomp'
get_raw_data
get_raw_data (vars:List[str]=None, required_vars=['permno', 'date'], nrows:int=None, start_date:str=None, end_date:str=None)
Downloads vars
from start_date
to end_date
from WRDS {LIBRARY}.{TABLE}, {LIBRARY}.{NAMES_TABLE} and {LIBRARY}.{DELIST_TABLE}.
Type | Default | Details | |
---|---|---|---|
vars | List[str] | None | If None, downloads default_raw_vars ; use ’*’ to get all available variables |
required_vars | list | [‘permno’, ‘date’] | Variables that are always downloaded, regardless vars argument |
nrows | int | None | Number of rows to download. If None, full dataset will be downloaded |
start_date | str | None | Start date in MM/DD/YYYY format |
end_date | str | None | End date in MM/DD/YYYY format |
Returns | pd.DataFrame |
= get_raw_data(vars = '*', nrows=100)
r 1) r.head(
cusip | permno | permco | issuno | hexcd | hsiccd | date | bidlo | askhi | prc | ... | nwperm | nwcomp | nextdt | dlamt | dlretx | dlprc | dlpdt | dlret | acperm | accomp | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 68391610 | 10000.0 | 7952.0 | 10396.0 | 3.0 | 3990.0 | 1985-12-31 | NaN | NaN | NaN | ... | NaN | NaN | None | NaN | NaN | NaN | None | NaN | None | None |
1 rows × 48 columns
process_raw_data
process_raw_data (df:pandas.core.frame.DataFrame=None, clean_kwargs:dict={})
Applies pandasmore.setup_panel
to df
Type | Default | Details | |
---|---|---|---|
df | pd.DataFrame | None | Must contain permno and date columns |
clean_kwargs | dict | {} | Params to pass to pdm.setup_panel other than panel_ids , time_var , and freq |
Returns | pd.DataFrame |
= process_raw_data(r) df
0) df.head(
date | dtdate | cusip | permco | issuno | hexcd | hsiccd | bidlo | askhi | prc | ... | nwperm | nwcomp | nextdt | dlamt | dlretx | dlprc | dlpdt | dlret | acperm | accomp | ||
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
permno | Mdate |
0 rows × 48 columns
delist_adj_ret
delist_adj_ret (df:pandas.core.frame.DataFrame, adj_ret_var:str='ret_adj')
Adjusts for returns for delisting using Shumway and Warther (1999) and Johnson and Zhao (2007)
Type | Default | Details | |
---|---|---|---|
df | pd.DataFrame | Requires ret ,exchcd ,dlret ,dlstcd , and dlstdt variables |
|
adj_ret_var | str | ret_adj | Name of the adjusted return variable created by this function |
Returns | pd.DataFrame |
= delist_adj_ret(process_raw_data(get_raw_data(['dlret','dlstcd','dlstdt','exchcd','ret','siccd','naics'], nrows=1000)))
dl 0) dl.head(
date | dtdate | naics | dlret | dlstdt | exchcd | dlstcd | ret | siccd | ret_adj | ||
---|---|---|---|---|---|---|---|---|---|---|---|
permno | Mdate |
features
features (df:pandas.core.frame.DataFrame)
= features(df) ftr
1) ftr.head(
ret_adj | lbhret12 | retvol12 | ||
---|---|---|---|---|
permno | Mdate | |||
10000 | 1985-12 | NaN | NaN | NaN |