compa

Retrieve and process data from WRDS Compustat Annual

from __future__ import annotations
from typing import List

import pandas as pd
import numpy as np

import pandasmore as pdm
from finsets.wrds import wrds_api

PROVIDER = 'Wharton Research Data Services (WRDS)'
URL = 'https://wrds-www.wharton.upenn.edu/pages/get-data/compustat-capital-iq-standard-poors/compustat/north-america-daily/fundamentals-annual/'
LIBRARY = 'comp'
TABLE = 'funda'
COMPANY_TABLE = 'company' #contains some header information that is missing from comp.funda (e.g. sic and naics) 
FREQ = 'A'
MIN_YEAR = 1950
MAX_YEAR = None
ENTITY_ID_IN_RAW_DSET = 'gvkey'
ENTITY_ID_IN_CLEAN_DSET = 'gvkey'
TIME_VAR_IN_RAW_DSET = 'datadate'
TIME_VAR_IN_CLEAN_DSET = f'{FREQ}date'

source

list_all_vars

 list_all_vars ()

Collects names of all available variables from WRDS f{LIBRARY}.{TABLE} and {LIBRARY}.{COMPANY_TABLE}.

all_vars = list_all_vars()

all_vars.loc[all_vars.name.str.contains('conm'),:]

	name	type	wrds_library	wrds_table
9	conm	VARCHAR(70)	comp	funda
0	conm	VARCHAR(70)	comp	company
10	conml	VARCHAR(100)	comp	company

source

default_raw_vars

 default_raw_vars ()

Defines default variables used in get_raw_data if none are specified.

print(default_raw_vars())

['datadate', 'gvkey', 'cusip', 'cik', 'tic', 'fyear', 'fyr', 'fic', 'naicsh', 'sich', 'sic', 'naics', 'exchg', 'lt', 'at', 'txditc', 'pstkl', 'pstkrv', 'pstk', 'csho', 'ajex', 'rdip', 'act', 'dvc', 'xad', 'seq', 'che', 'lct', 'dlc', 'ib', 'dvp', 'txdi', 'dp', 'txp', 'oancf', 'ivncf', 'fincf', 'dltt', 'mib', 'ceq', 'invt', 'cogs', 'revt', 'sale', 'capx', 'xrd', 'txdb', 'prcc_f', 'sstk', 'prstkc', 'dltis', 'dltr', 'emp', 'dd1', 'ppegt', 'ppent', 'xint', 'txt', 'sppe', 'gdwl', 'xrent', 're', 'dvpsx_f', 'tstk', 'wcap', 'rect', 'xsga', 'aqc', 'oibdp', 'dpact', 'ni', 'ivao', 'ivst', 'dv', 'intan', 'pi', 'txfo', 'pifo', 'xpp', 'drc', 'drlt', 'ap', 'xacc', 'itcb']

source

parse_varlist

 parse_varlist (vars:List[str]=None, required_vars=[])

Figures out which vars come from the {LIBRARY}.{TABLE} table and which come from the {LIBRARY}.{COMPANY_TABLE} table and adds a. and b. prefixes to variable names to feed into an SQL query

parse_varlist(['at','lt','sic'])

'a.lt,a.at,b.sic'

source

get_raw_data

 get_raw_data (vars:List[str]=None, required_vars:List[str]=['gvkey',
               'datadate'], nrows:int=None, start_date:str=None,
               end_date:str=None)

Downloads vars from start_date to end_date from WRDS {LIBRARY}.{TABLE} library

	Type	Default	Details
vars	List[str]	None	If None, downloads `default_raw_vars`; use ’*’ to get all available variables
required_vars	List[str]	[‘gvkey’, ‘datadate’]	list of variables that will get downloaded, even if not in `vars`
nrows	int	None	Number of rows to download. If None, full dataset will be downloaded
start_date	str	None	Start date in MM/DD/YYYY format
end_date	str	None	End date in MM/DD/YYYY format
Returns	pd.DataFrame

The get_raw_data function will produce unique gvkey-datadate records, which is not the case for the data we would get from the WRDS website, because they use indfmt IN ('INDL','FS') (while we use indfmt='INDL').

m = get_raw_data(['at','indfmt','sich','sic'],nrows=1)
m

	gvkey	datadate	indfmt	sich	at	sic
0	001000	1961-12-31	INDL	None	None	3089

r = get_raw_data(vars='*', nrows=1)
r

	gvkey	datadate	fyear	indfmt	consol	popsrc	datafmt	tic	cusip	conm	...	priusa	sic	spcindcd	spcseccd	spcsrc	state	stko	weburl	dldte	ipodate
0	001000	1961-12-31	1961.0	INDL	C	D	STD	AE.2	000032102	A & E PLASTIK PAK INC	...	01	3089	325.0	978.0	None	None	0.0	None	1978-06-30	None

1 rows × 982 columns

raw = get_raw_data(vars = ['at', 'lt'], start_date='01/01/2021', end_date='01/01/2022', nrows=1)

raw.head(0)

	gvkey	datadate	lt	at

raw = get_raw_data(start_date='01/01/2021', end_date='01/01/2023')

raw.head(0)

	gvkey	datadate	ceq	dltr	exchg	act	prcc_f	dp	aqc	at	...	wcap	re	cik	dlc	pi	ap	rdip	xint	rect	sppe

0 rows × 84 columns

source

process_raw_data

 process_raw_data (df:pandas.core.frame.DataFrame=None,
                   clean_kwargs:dict={})

Applies pandasmore.setup_panel to df

	Type	Default	Details
df	pd.DataFrame	None	Must contain `permno` and `datadate` columns
clean_kwargs	dict	{}	Params to pass to `pdm.setup_panel` other than `panel_ids`, `time_var`, and `freq`
Returns	pd.DataFrame

df_clean = process_raw_data(raw)

df_clean.head(0)

		datadate	dtdate	ceq	dltr	exchg	act	prcc_f	dp	aqc	at	...	wcap	re	cik	dlc	pi	ap	rdip	xint	rect	sppe
gvkey	Adate

0 rows × 84 columns

source

features

 features (df:pandas.core.frame.DataFrame=None)

ftrs = features(df_clean)

ftrs.head(0)

		sic_full	naics_full	stock_price	lag_at	mktcap	pstk0	pref_stock	shreq	bookeq	bookeq_w_itcb	...	cflow_is	cflow_cfs	cflow_full	cflow_is_2la	cflow_cfs_2la	cflow_full_2la	cash_2a	booklev	dividends_2la	repurchases_2la
gvkey	Adate

0 rows × 34 columns