from __future__ import annotations
from typing import List
import pandas as pd
import numpy as np
import pandasmore as pdm
from finsets.wrds import wrds_apicrspd
Since this is a proprietary dataset, in the documentation below, I can not show any of the data that is being retrieved/generated (will show only column names).
PROVIDER = 'Wharton Research Data Services (WRDS)'
URL = 'https://wrds-www.wharton.upenn.edu/pages/get-data/center-research-security-prices-crsp/annual-update/stock-security-files/daily-stock-file/'
LIBRARY = 'crsp'
TABLE = 'dsf'
NAMES_TABLE = 'dsenames'
DELIST_TABLE = 'dsedelist'
FREQ = 'D'
MIN_YEAR = 1925
MAX_YEAR = None
ENTITY_ID_IN_RAW_DSET = 'permno'
ENTITY_ID_IN_CLEAN_DSET = 'permno'
TIME_VAR_IN_RAW_DSET = 'date'
TIME_VAR_IN_CLEAN_DSET = f'{FREQ}date'list_all_vars
list_all_vars ()
Collects names of all available variables from WRDS {LIBRARY}.{TABLE} and {LIBRARY}.{NAMES_TABLE}
all_vars = list_all_vars()all_vars.name.count()60
all_vars.head()| name | type | wrds_library | wrds_table | |
|---|---|---|---|---|
| 0 | cusip | VARCHAR(8) | crsp | dsf |
| 1 | permno | DOUBLE_PRECISION | crsp | dsf |
| 2 | permco | DOUBLE_PRECISION | crsp | dsf |
| 3 | issuno | DOUBLE_PRECISION | crsp | dsf |
| 4 | hexcd | DOUBLE_PRECISION | crsp | dsf |
default_raw_vars
default_raw_vars ()
Defines default variables used in get_raw_data if none are specified.
print(default_raw_vars())['permno', 'permco', 'date', 'ret', 'retx', 'shrout', 'prc', 'shrcd', 'exchcd', 'cfacpr', 'cfacshr', 'dlret', 'dlstcd', 'dlstdt']
parse_varlist
parse_varlist (vars:List[str]=None, required_vars=[])
Figures out which vars come from the {LIBRARY}.{TABLE} table and which come from the {LIBRARY}.{NAMES_TABLE} table and adds a. and b. prefixes to variable names to feed into an SQL query
parse_varlist(['ret', 'siccd', 'dlret'])'c.dlret,a.ret,b.siccd'
parse_varlist('*')'a.cusip,a.permno,a.permco,a.issuno,a.hexcd,a.hsiccd,a.date,a.bidlo,a.askhi,a.prc,a.vol,a.ret,a.bid,a.ask,a.shrout,a.cfacpr,a.cfacshr,a.openprc,a.numtrd,a.retx,b.namedt,b.nameendt,b.shrcd,b.exchcd,b.siccd,b.ncusip,b.ticker,b.comnam,b.shrcls,b.tsymbol,b.naics,b.primexch,b.trdstat,b.secstat,b.compno,c.dlstdt,c.dlstcd,c.nwperm,c.nwcomp,c.nextdt,c.dlamt,c.dlretx,c.dlprc,c.dlpdt,c.dlret,c.acperm,c.accomp'
get_raw_data
get_raw_data (vars:List[str]=None, required_vars=['permno', 'date'], nrows:int=None, start_date:str='01/01/1950', end_date:str=None, shrcd_exchcd_filters:bool=True)
Downloads vars from start_date to end_date from WRDS {LIBRARY}.{TABLE}, {LIBRARY}.{NAMES_TABLE} and {LIBRARY}.{DELIST_TABLE}.
| Type | Default | Details | |
|---|---|---|---|
| vars | List[str] | None | If None, downloads default_raw_vars; use ’*’ to get all available variables |
| required_vars | list | [‘permno’, ‘date’] | Variables that are always downloaded, regardless vars argument |
| nrows | int | None | Number of rows to download. If None, full dataset will be downloaded |
| start_date | str | 01/01/1950 | Start date in MM/DD/YYYY format |
| end_date | str | None | End date in MM/DD/YYYY format |
| shrcd_exchcd_filters | bool | True | If true, keep only observations with shrcd in [10,11] and exchcd in [1,2,3] |
| Returns | pd.DataFrame |
r = get_raw_data(vars = '*', nrows=1000)
r.head(1)| cusip | permno | permco | issuno | hexcd | hsiccd | date | bidlo | askhi | prc | ... | nwperm | nwcomp | nextdt | dlamt | dlretx | dlprc | dlpdt | dlret | acperm | accomp | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 68391610 | 10000.0 | 7952.0 | 10396.0 | 3.0 | 3990.0 | 1986-01-07 | 2.375 | 2.75 | -2.5625 | ... | NaN | NaN | None | NaN | NaN | NaN | None | NaN | None | None |
1 rows × 47 columns
process_raw_data
process_raw_data (df:pandas.core.frame.DataFrame=None, clean_kwargs:dict={})
Applies pandasmore.setup_panel to df
| Type | Default | Details | |
|---|---|---|---|
| df | pd.DataFrame | None | Must contain permno and date columns |
| clean_kwargs | dict | {} | Params to pass to pdm.setup_panel other than panel_ids, time_var, and freq |
| Returns | pd.DataFrame |
df = process_raw_data(r)df.head(0)| date | dtdate | cusip | permco | issuno | hexcd | hsiccd | bidlo | askhi | prc | ... | nwperm | nwcomp | nextdt | dlamt | dlretx | dlprc | dlpdt | dlret | acperm | accomp | ||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| permno | Ddate |
0 rows × 47 columns
delist_adj_ret
delist_adj_ret (df:pandas.core.frame.DataFrame, adj_ret_var:str='ret_adj')
Adjusts for returns for delisting using Shumway and Warther (1999) and Johnson and Zhao (2007)
| Type | Default | Details | |
|---|---|---|---|
| df | pd.DataFrame | Requires ret,exchcd,dlret,dlstcd, and dlstdt variables |
|
| adj_ret_var | str | ret_adj | Name of the adjusted return variable created by this function |
| Returns | pd.DataFrame |
dl = delist_adj_ret(process_raw_data(get_raw_data(['dlret','dlstcd','dlstdt','exchcd','ret','naics','siccd'], nrows=1000)))
dl.head(0)| date | dtdate | exchcd | dlstcd | naics | dlret | siccd | dlstdt | ret | ret_adj | ||
|---|---|---|---|---|---|---|---|---|---|---|---|
| permno | Ddate |
features
features (df:pandas.core.frame.DataFrame)
ftr = features(df)ftr.head(0)| ret_adj | lbhret12 | retvol12 | ||
|---|---|---|---|---|
| permno | Ddate |