compq

Retrieve and process data from WRDS Compustat Quarterly database.

from __future__ import annotations
from typing import List

import pandas as pd
import numpy as np

import pandasmore as pdm
from finsets.wrds import wrds_api

PROVIDER = 'Wharton Research Data Services (WRDS)'
URL = 'https://wrds-www.wharton.upenn.edu/pages/get-data/compustat-capital-iq-standard-poors/compustat/north-america-daily/fundamentals-quarterly/'
LIBRARY = 'comp'
TABLE = 'fundq'
COMPANY_TABLE = 'company' #contains some header information that is missing from comp.funda (e.g. sic and naics) 
FREQ = 'Q'
MIN_YEAR = 1961
MAX_YEAR = None
ENTITY_ID_IN_RAW_DSET = 'gvkey'
ENTITY_ID_IN_CLEAN_DSET = 'gvkey'
TIME_VAR_IN_RAW_DSET = 'datadate'
TIME_VAR_IN_CLEAN_DSET = f'{FREQ}date'

source

list_all_vars

 list_all_vars ()

Collects names of all available variables from WRDS f{LIBRARY}.{TABLE} and {LIBRARY}.{COMPANY_TABLE}.

all_vars = list_all_vars()

all_vars.query("name.str.contains('naics')")

	name	type	wrds_library	wrds_table
25	naics	VARCHAR(6)	comp	company

source

default_raw_vars

 default_raw_vars ()

Defines default variables used in get_raw_data if none are specified.

print(default_raw_vars())

['datadate', 'gvkey', 'cik', 'cusip', 'fyearq', 'fqtr', 'fyr', 'sic', 'naics', 'exchg', 'rdq', 'fic', 'atq', 'req', 'xrdq', 'cheq', 'saleq', 'revtq', 'dpq', 'ibq', 'cshoq', 'ceqq', 'seqq', 'txdiq', 'ltq', 'txditcq', 'pstkq', 'pstkrq', 'lctq', 'actq', 'piq', 'niq', 'cshprq', 'epsfxq', 'opepsq', 'epsfiq', 'epspiq', 'epspxq', 'dlttq', 'dlcq', 'txtq', 'xintq', 'ppegtq', 'ppentq', 'rectq', 'invtq', 'cogsq', 'xsgaq', 'ajexq', 'prccq', 'capxy', 'oancfy', 'sstky', 'prstkcy', 'dltisy', 'dltry', 'dvpq', 'dvy', 'sppey', 'aqcy', 'fopty', 'scstkcy', 'wcapq', 'oibdpq', 'tstkq', 'apdedateq', 'fdateq', 'cdvcy', 'cheq', 'intanq', 'gdwlq', 'mibq', 'oiadpq', 'ivaoq', 'npq', 'rectrq']

source

parse_varlist

 parse_varlist (vars:List[str]=None, required_vars=[])

Figures out which vars come from the {LIBRARY}.{TABLE} table and which come from the {LIBRARY}.{COMPANY_TABLE} table and adds a. and b. prefixes to variable names to feed into an SQL query

parse_varlist(['atq','ltq','sic'])

'b.sic,a.ltq,a.atq'

source

get_raw_data

 get_raw_data (vars:List[str]=None, required_vars:List[str]=['gvkey',
               'datadate', 'fyearq', 'fqtr', 'rdq'], nrows:int=None,
               start_date:str=None, end_date:str=None)

Downloads vars from start_date to end_date from WRDS {LIBRARY}.{TABLE} and {LIBRARY}.{COMPANY_TABLE}. It also adds sich and naicsh from the annual table (comp.funda)

	Type	Default	Details
vars	List[str]	None	If None, downloads `default_raw_vars`; use ’*’ to get all available variables
required_vars	List[str]	[‘gvkey’, ‘datadate’, ‘fyearq’, ‘fqtr’, ‘rdq’]	list of variables that will get downloaded, even if not in `vars`
nrows	int	None	Number of rows to download. If None, full dataset will be downloaded
start_date	str	None	Start date in MM/DD/YYYY format
end_date	str	None	End date in MM/DD/YYYY format
Returns	pd.DataFrame

The get_raw_data function will produce unique gvkey-datadate records, which is not the case for the data we would get from the WRDS website, because they use indfmt IN ('INDL','FS') (while we use indfmt='INDL').

m = get_raw_data(['atq','indfmt','sic'],nrows=1)
m

	gvkey	datadate	fyearq	fqtr	rdq	indfmt	sic	atq	sich	naicsh
0	001000	1966-03-31	1966.0	1.0	None	INDL	3089	None	None	None

r = get_raw_data(vars='*', nrows=1000)
r.head(1)

	gvkey	datadate	fyearq	fqtr	fyr	indfmt	consol	popsrc	datafmt	tic	...	spcindcd	spcseccd	spcsrc	state	stko	weburl	dldte	ipodate	sich	naicsh
0	001345	1962-04-30	1961.0	4.0	4.0	INDL	C	D	STD	3ALRM	...	440.0	978.0	None	GA	0.0	None	1980-06-30	None	None	NaN

1 rows × 683 columns

raw = get_raw_data(start_date='01/01/2021', end_date='01/01/2023')

raw.head(0)

	gvkey	datadate	fyearq	fqtr	rdq	naics	sstky	seqq	actq	saleq	...	epsfxq	fdateq	cdvcy	wcapq	oiadpq	txdiq	gdwlq	req	sich	naicsh

0 rows × 77 columns

source

process_raw_data

 process_raw_data (df:pandas.core.frame.DataFrame=None,
                   clean_kwargs:dict={})

Drops duplicage, cleans up dates and applies pandasmore.setup_panel to df

	Type	Default	Details
df	pd.DataFrame	None	Must contain `permno` and `datadate` columns
clean_kwargs	dict	{}	Params to pass to `pdm.setup_panel` other than `panel_ids`, `time_var`, and `freq`
Returns	pd.DataFrame

df_clean = process_raw_data(raw)

print(df_clean.shape)
df_clean.head(1)

(95932, 79)

		datadate	dtdate	dtdate_fiscal	Qdate_fiscal	fyearq	fqtr	rdq	naics	sstky	seqq	...	epsfxq	fdateq	cdvcy	wcapq	oiadpq	txdiq	gdwlq	req	sich	naicsh
gvkey	Qdate
001004	2021Q1	2021-02-28	2021-02-28	2020-09-30	2020Q3	2020.0	3.0	2021-03-23	423860	NaN	932.4	...	0.87	2021-04-05	NaN	640.7	19.5	NaN	NaN	685.3	5080	423860

1 rows × 79 columns

source

ytd_to_quarterly

 ytd_to_quarterly (df:pandas.core.frame.DataFrame=None,
                   vars:List[str]=['capxy', 'oancfy', 'sstky', 'prstkcy',
                   'dltisy', 'dltry', 'dvy', 'sppey', 'aqcy', 'fopty',
                   'scstkcy'], suffix:str='_q')

Convert YTD variables to quarterly variables by taking the difference between the current and previous quarter.

	Type	Default	Details
df	pd.DataFrame	None
vars	List[str]	[‘capxy’, ‘oancfy’, ‘sstky’, ‘prstkcy’, ‘dltisy’, ‘dltry’, ‘dvy’, ‘sppey’, ‘aqcy’, ‘fopty’, ‘scstkcy’]
suffix	str	_q	Suffix to add to the new quarterly variables
Returns	pd.DataFrame

q = ytd_to_quarterly(df_clean)
q.head(0)

		capxy_q	oancfy_q	sstky_q	prstkcy_q	dltisy_q	dltry_q	dvy_q	sppey_q	aqcy_q	fopty_q	scstkcy_q
gvkey	Qdate

source

features

 features (df:pandas.core.frame.DataFrame=None)

Computes a set of features from df

ftrs = features(df_clean)

ftrs.head(1)

		capxy_q	oancfy_q	sstky_q	prstkcy_q	dltisy_q	dltry_q	dvy_q	sppey_q	aqcy_q	fopty_q	...	cflow_is	cflow_cfs	cflow_full	cflow_is_2la	cflow_cfs_2la	cflow_full_2la	cash_2a	booklev	dividends_2la	repurchases_2la
gvkey	Qdate
001004	2021Q1	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	...	40.0	NaN	NaN	NaN	NaN	NaN	0.068542	NaN	NaN	NaN

1 rows × 44 columns