Background
In the Compustat database, a firm's headquarters state is not a historical record but a current value, stored in comp.company. In other words, when a firm relocates its headquarters (or changes any other header information), all of its historical records are updated to the new value. This introduces measurement error whenever you add headquarters-state fixed effects to a model or match firms to local economic/demographic data.
The standard fix is to recover historical headquarters locations from firms' historical SEC 10-K/Q filings. My blog post Textual Analysis on SEC Filings walks through the steps and code to do this from scratch. If you would rather use a precompiled dataset, the Software Repository for Accounting and Finance at the University of Notre Dame provides an augmented 10-X header dataset.
Based on this dataset, I computed a few quick statistics: in any given year, roughly 2-3% of Compustat firms change their headquarters state, and 2,947 of 17,221 firms (17%) have changed headquarters state at least once. That is not a number you can safely ignore.
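For reference, these statistics are straightforward to reproduce. Below is a minimal pandas sketch, assuming the same cik/file_date/ba_state columns used in the extraction script later in this post; the exact figures depend on how filings are deduplicated within a year.

import pandas as pd

# Sketch: collapse filings to one state per firm-year, then flag
# firm-years whose state differs from the firm's previous state.
df = pd.read_csv(
    "~/Downloads/LM_EDGAR_10X_Header_1994_2018.csv",
    usecols=["cik", "file_date", "ba_state"],
    dtype={"cik": str},
    parse_dates=["file_date"],
)
df["ba_state"] = df["ba_state"].str.upper()
df = df[df["ba_state"].str.isalpha().fillna(False)]
df["year"] = df["file_date"].dt.year

# Keep the state reported in the last filing of each firm-year
panel = (
    df.sort_values("file_date")
      .groupby(["cik", "year"], as_index=False)["ba_state"]
      .last()
      .sort_values(["cik", "year"])
)

prev = panel.groupby("cik")["ba_state"].shift()
panel["moved"] = prev.notna() & (panel["ba_state"] != prev)

print(panel.groupby("year")["moved"].mean())       # share changing state each year
print(panel.groupby("cik")["moved"].any().mean())  # share ever changing state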
How to obtain firms' historical headquarters states (1969-2018)?
This breaks into two periods. For 1969-2003, use Bai, Fairhurst and Serfling (2020 RFS). The authors rely on SEC filings after 1994; before 1994 they hand-collect locations from Moody's Manuals (later Mergent Manuals) and Dun & Bradstreet's Million Dollar Directory (later acquired by Mergent). In their sample of 115,432 firm-years over 1969-2003, 9,847 firms (87.50%) never moved their headquarters, 1,211 (10.76%) moved once, 178 (1.58%) moved twice, and 18 (0.16%) moved three times.
I extract the 1994-2018 data from the augmented 10-X header dataset myself. First, run this Python script:
import pandas as pd

filepath = "~/Downloads/LM_EDGAR_10X_Header_1994_2018.csv"

if __name__ == "__main__":
    df = pd.read_csv(
        filepath,
        usecols=["cik", "file_date", "ba_state"],
        dtype={"cik": str},
        parse_dates=["file_date"],
    )
    # Some `ba_state` codes are lowercase
    df["ba_state"] = df["ba_state"].str.upper()
    # Some `ba_state` codes are missing or are not valid US state codes;
    # fill the NaNs from `str.isalpha()` so the boolean mask is well defined
    df = df[df["ba_state"].str.isalpha().fillna(False)]
    df.drop_duplicates().to_stata(
        "~/Downloads/historical_state_1994_2018.dta",
        write_index=False,
        convert_dates={"file_date": "td"},
    )
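Before uploading, it is worth reading the exported file back to confirm that the deduplication held and that file_date survived the Stata %td conversion. A quick sanity-check sketch:

import pandas as pd

out = pd.read_stata("~/Downloads/historical_state_1994_2018.dta")
assert not out.duplicated().any()  # drop_duplicates() held
assert pd.api.types.is_datetime64_any_dtype(out["file_date"])  # dates round-trip
print(out["ba_state"].value_counts().head())  # eyeball the state codes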
Then upload the exported Stata dataset to the WRDS Cloud and run this SAS script:
libname hs "~/historical_state";
/* Historical HQ state (1994 to 2018) from augmented 10-X header dataset */
proc import datafile="~/historical_state/historical_state_1994_2018.dta"
out=historical_state_1994_2018 dbms=stata replace;
/* Historical HQ state (1969 to 2003) from Bai, Fairhurst and Serfling (2020 RFS) */
proc import datafile="~/historical_state/hist_headquarters_Bai_et_al.dta"
out=hist_headquarters_Bai_et_al dbms=stata replace;
/* Build the post-1994 dataset using SEC filings */
proc sql;
create table funda as
select gvkey, cik, datadate, fyear from comp.funda
where indfmt= 'INDL' and datafmt='STD' and popsrc='D' and consol='C'
and year(datadate) between 1994 and 2018
/* "For firms that change fiscal year within a calendar year,
we take the last reported date when extracting financial data.
This leaves us with one set of observations for each firm (gvkey) in each year."
-- Pflueger, Siriwardane and Sunderam (2020 QJE) */
group by gvkey, fyear having datadate=max(datadate);
create table firm_historical_state as
select a.*, b.ba_state as state_sec label="State from SEC filings"
from funda as a left join historical_state_1994_2018 as b
on a.cik=b.cik and year(a.datadate)=year(b.file_date) and b.file_date<=a.datadate
group by a.gvkey, a.datadate
/* use the SEC filing closest to and before the Compustat datadate */
having b.file_date=max(b.file_date);
create table historical_state_1994_2018 as
select a.*, b.state as state_comp label="State from Compustat"
from firm_historical_state as a left join comp.company as b
on a.gvkey=b.gvkey
order by a.gvkey, a.datadate;
quit;
/* Sanity check: no duplicated gvkey-fyear */
proc sort data=historical_state_1994_2018 nodupkey; by gvkey datadate; run;
proc sql;
/* Convert Bai et al.'s numeric gvkey to Compustat's 6-digit character key;
   write to a new table rather than recursively overwriting the source */
create table bai_et_al_states as
select put(gvkeyn, z6.) as gvkey, fyear, state
from hist_headquarters_Bai_et_al;
quit;
/* Stack together the two datasets */
data states;
set bai_et_al_states
historical_state_1994_2018(where=(fyear>2003) keep=gvkey fyear state:);
run;
proc sql;
create table hs.corrected_hist_state_1969_2018 as
select *, coalesce(state, state_sec, state_comp) as corrected_state
from states where not missing(calculated corrected_state)
order by gvkey, fyear;
quit;
/* Sanity check: no duplicated gvkey-fyear */
proc sort data=hs.corrected_hist_state_1969_2018 nodupkey; by gvkey fyear; run;
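Once hs.corrected_hist_state_1969_2018 is downloaded from the WRDS Cloud (e.g., exported to Stata format), merging it into a firm-year panel is a single join on gvkey and fyear. A minimal pandas sketch; the file paths and my_firm_panel.dta are hypothetical placeholders:

import pandas as pd

states = pd.read_stata("~/Downloads/corrected_hist_state_1969_2018.dta")
# One record per gvkey-fyear, per the nodupkey sanity check above
assert not states.duplicated(["gvkey", "fyear"]).any()

# Hypothetical firm-year panel with gvkey and fyear columns
panel = pd.read_stata("~/Downloads/my_firm_panel.dta")
panel = panel.merge(
    states[["gvkey", "fyear", "corrected_state"]],
    on=["gvkey", "fyear"],
    how="left",
)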
Download the data
If you need this dataset as well, you can download it from my website:
Firm Historical Headquarter State from SEC 10K/Q Filings