10  获取数据:GMD


数据分析的第一步是获取数据。本章将介绍如何从 Global Macro Data 获取全球宏观经济数据。该数据库涵盖了 243 个国家和地区的 46 个宏观经济变量,包括投资占GDP比重(inv_GDP)、出口占GDP 比重(exports_GDP)、政府支出占 GDP 比重(govexp_GDP)等。 有关该数据库的详细介绍参见:GMD:最新全球宏观数据库-243个国家46个宏观变量

我们将详细讲解以下内容:

  1. 数据获取与清洗:如何下载并加载数据,处理缺失值和异常值。
  2. 生成新变量:例如以美元计价的出口数据、出口占比等。
  3. 数据可视化:通过静态和动态图表展示全球宏观经济趋势,包括时间序列图、堆积面积图和动画图表。

通过本章的学习,您将掌握从数据获取到可视化分析的完整流程,为后续的深入研究奠定基础。

10.1 将数据下载到本地

我们需要为本项目设定一个文件夹,将所有数据与脚本统一存放,例如 D:\Github\dslian\body。用 os.chdir() 切过去后,接着通过 pd.read_csv() 直接从网址读取 GMD.csv,约 9 MB,十余秒即可载入为 DataFrame。最后用 to_csv() 保存到本地 data 子目录,并设定 index=False,既留备份,也便于离线复现。

import pandas as pd
import os

os.chdir(r"D:\Github\dslian\body")  # 修改为你的工作路径

# 获取数据并加载到 DataFrame
''' 9M 左右,下载需要 10-15 秒
url = "https://www.globalmacrodata.com/GMD.csv"
data = pd.read_csv(url)

# 保存数据到 data 文件夹下
data.to_csv("data/GMD.csv", index=False)
'''

# 从 data 文件夹读取数据
data = pd.read_csv("data/GMD.csv")

# 查看前几行数据
print(data.head())
print(data.columns)
  countryname ISO3  year  nGDP  rGDP  rGDP_pc  rGDP_USD  deflator  cons  \
0       Aruba  ABW  1950   NaN   NaN      NaN       NaN       NaN   NaN   
1       Aruba  ABW  1951   NaN   NaN      NaN       NaN       NaN   NaN   
2       Aruba  ABW  1952   NaN   NaN      NaN       NaN       NaN   NaN   
3       Aruba  ABW  1953   NaN   NaN      NaN       NaN       NaN   NaN   
4       Aruba  ABW  1954   NaN   NaN      NaN       NaN       NaN   NaN   

   rcons  ...  ltrate  cbrate  M0  M1  M2  M3  M4  SovDebtCrisis  \
0    NaN  ...     NaN     NaN NaN NaN NaN NaN NaN            NaN   
1    NaN  ...     NaN     NaN NaN NaN NaN NaN NaN            NaN   
2    NaN  ...     NaN     NaN NaN NaN NaN NaN NaN            NaN   
3    NaN  ...     NaN     NaN NaN NaN NaN NaN NaN            NaN   
4    NaN  ...     NaN     NaN NaN NaN NaN NaN NaN            NaN   

   CurrencyCrisis  BankingCrisis  
0             NaN            NaN  
1             NaN            NaN  
2             NaN            NaN  
3             NaN            NaN  
4             NaN            NaN  

[5 rows x 49 columns]
Index(['countryname', 'ISO3', 'year', 'nGDP', 'rGDP', 'rGDP_pc', 'rGDP_USD',
       'deflator', 'cons', 'rcons', 'cons_GDP', 'inv', 'inv_GDP', 'finv',
       'finv_GDP', 'exports', 'exports_GDP', 'imports', 'imports_GDP', 'CA',
       'CA_GDP', 'USDfx', 'REER', 'govexp', 'govexp_GDP', 'govrev',
       'govrev_GDP', 'govtax', 'govtax_GDP', 'govdef', 'govdef_GDP', 'govdebt',
       'govdebt_GDP', 'HPI', 'CPI', 'infl', 'pop', 'unemp', 'strate', 'ltrate',
       'cbrate', 'M0', 'M1', 'M2', 'M3', 'M4', 'SovDebtCrisis',
       'CurrencyCrisis', 'BankingCrisis'],
      dtype='object')
import matplotlib.pyplot as plt

# 选择变量列表和国家
vlist = ["inv_GDP", "exports_GDP", "imports_GDP", "govexp_GDP"]
cname = "CHN"

# 筛选出指定国家的数据
china_data = data[data["ISO3"] == cname]

# 选择样本范围
china_data = china_data[(china_data["year"] >= 1978) & (china_data["year"] <= 2024)]

# 绘制简单的时序图
plt.figure(figsize=(10, 6))
plt.plot(china_data["year"], china_data["inv_GDP"], label="inv_GDP")
plt.xlabel("Year")
plt.ylabel("Value")
plt.title("Time Series of inv_GDP")
plt.legend()
plt.grid()
plt.show()

# 绘制多变量时序图
plt.figure(figsize=(10, 7))
for var in vlist:
    plt.plot(china_data["year"], china_data[var], label=var)
plt.xlabel("Year")
plt.ylabel("Value")
plt.title("Time Series of Multiple Variables")
plt.xticks(range(1980, 2026, 5))
plt.legend(loc="upper left", bbox_to_anchor=(1, 1), ncol=1)
plt.tight_layout()
plt.show()

# 定义变量列表与国家代码
data = pd.read_csv("data/GMD.csv")

start_year = 1900
end_year = 2024

vlist = ["exports", "USDfx"]
cname = ["USA", "DEU", "FRA", "GBR", "JPN", "CHN"]

# 创建以美元计价的出口数据
data["exports_USD"] = data["exports"] / data["USDfx"]

# 删除数据质量有问题的国家
invalid_countries = ["MMR", "SLE", "ROU", "ZWE", "POL", "YUG"]
data = data[~data["ISO3"].isin(invalid_countries)]

# 保留必要变量并选择样本范围
data = data[["ISO3", "year", "exports_USD"]].dropna()
data = data[(data["year"] >= start_year) & (data["year"] <= end_year)]

# 计算每年的全球总出口
data["total_exports"] = data.groupby("year")["exports_USD"].transform("sum")

# 计算每个国家的出口占比
data["export_share"] = (data["exports_USD"] / data["total_exports"]) * 100

# 保留指定国家以及其他所有国家
data = data[data["ISO3"].isin(cname) | (data["ISO3"] != "")]

# 计算指定国家的出口总和与 ROW(其他所有国家)的出口总和
data["selected_sum"] = data.groupby("year")["export_share"].transform(
    lambda x: x if data["ISO3"].isin(cname).any() else 0
)
data["ROW_sum"] = data.groupby("year")["export_share"].transform(
    lambda x: x if ~data["ISO3"].isin(cname).any() else 0
)

# 创建排名变量并标识 ROW 与各个国家
rank_map = {"USA": 1, "FRA": 2, "GBR": 3, "JPN": 4, "CHN": 5, "DEU": 6}
data["rank"] = data["ISO3"].map(rank_map)
data.loc[~data["ISO3"].isin(cname), "ISO3"] = "ROW"
data.loc[data["ISO3"] == "ROW", "export_share"] = data["ROW_sum"]
data = data.drop_duplicates(subset=["ISO3", "year"]).sort_values(by=["year", "rank"])
data["cum_share"] = data.groupby("year")["export_share"].cumsum()

# 绘制图表
plt.figure(figsize=(10, 7))
bright_colors = {
    "ROW": "#33FF00",    # lime green
    "DEU": "#0072B2",    # blue
    "CHN": "#FF0000",    # red
    "JPN": "#CC79A7",    # purple pink
    "GBR": "#E69F00",    # orange
    "FRA": "#56B4E9",    # sky blue
    "USA": "#000000",    # black
}
for country in ["ROW", "DEU", "CHN", "JPN", "GBR", "FRA", "USA"]:
    subset = data[data["ISO3"] == country]
    plt.fill_between(subset["year"], subset["cum_share"], label=country, color=bright_colors[country], alpha=0.8)

plt.xlabel("Year")
plt.ylabel("Share of Global Exports (%)")
plt.title("Share of Global Exports by Country")

plt.ylim(0, 100)  # 修改为 100
plt.grid(axis="y", linestyle="--", linewidth=0.5, alpha=0.7)

# 图例美化:右上角内嵌透明背景
plt.legend(title="Country", loc="upper right", frameon=True, framealpha=0.9, facecolor="white")
plt.tight_layout()
plt.show()

10.1.1 更好的版本

import pandas as pd
import matplotlib.pyplot as plt
import matplotlib as mpl

# 读取数据
data = pd.read_csv("data/GMD.csv")

# 定义年份范围与国家代码
start_year = 1900
end_year = 2024
cname = ["USA", "DEU", "FRA", "GBR", "JPN", "CHN"]

# 创建以美元计价的出口数据
data["exports_USD"] = data["exports"] / data["USDfx"]

# 删除数据质量较差的国家
invalid_countries = ["MMR", "SLE", "ROU", "ZWE", "POL", "YUG"]
data = data[~data["ISO3"].isin(invalid_countries)]

# 保留必要变量并选择样本区间
data = data[["ISO3", "year", "exports_USD"]].dropna()
data = data[(data["year"] >= start_year) & (data["year"] <= end_year)]

# 计算每年全球总出口额
data["total_exports"] = data.groupby("year")["exports_USD"].transform("sum")

# 计算每个国家出口占比
data["export_share"] = (data["exports_USD"] / data["total_exports"]) * 100

# 保留目标国家与其他国家
data = data[data["ISO3"].isin(cname) | (data["ISO3"] != "")]

# 计算其他国家(ROW)的出口占比
data["selected_sum"] = data.groupby("year")["export_share"].transform(
    lambda x: x if data["ISO3"].isin(cname).any() else 0
)
data["ROW_sum"] = data.groupby("year")["export_share"].transform(
    lambda x: x if ~data["ISO3"].isin(cname).any() else 0
)

# 创建排名变量并标注 ROW 与各国家
rank_map = {"USA": 1, "FRA": 2, "GBR": 3, "JPN": 4, "CHN": 5, "DEU": 6}
data["rank"] = data["ISO3"].map(rank_map)
data.loc[~data["ISO3"].isin(cname), "ISO3"] = "ROW"
data.loc[data["ISO3"] == "ROW", "export_share"] = data["ROW_sum"]

# 保留唯一值并排序,计算累计占比
data = data.drop_duplicates(subset=["ISO3", "year"]).sort_values(by=["year", "rank"])
data["cum_share"] = data.groupby("year")["export_share"].cumsum()

# 设置图形样式
mpl.rcParams.update({
    "font.size": 12,
    "axes.titlesize": 16,
    "axes.labelsize": 14,
    "legend.fontsize": 11,
    "xtick.labelsize": 11,
    "ytick.labelsize": 11,
    "figure.dpi": 100,
    "figure.facecolor": "white",
    "axes.facecolor": "white",
    "axes.edgecolor": "black",
    "axes.grid": True,
    "grid.linestyle": "--",
    "grid.alpha": 0.5
})

# 定义配色方案(对比度高)
bright_colors = {
    "ROW": "#33FF00",    # lime green
    "DEU": "#0072B2",    # blue
    "CHN": "#FF0000",    # red
    "JPN": "#CC79A7",    # purple pink
    "GBR": "#E69F00",    # orange
    "FRA": "#56B4E9",    # sky blue
    "USA": "#000000",    # black
}

# 定义图例名称映射(替换 ISO3 为更易懂的国家名)
label_map = {
    "ROW": "Rest of World",
    "USA": "United States",
    "DEU": "Germany",
    "FRA": "France",
    "GBR": "United Kingdom",
    "JPN": "Japan",
    "CHN": "China"
}

# 绘制图形
plt.figure(figsize=(10, 6))
for country in ["ROW", "DEU", "CHN", "JPN", "GBR", "FRA", "USA"]:
    subset = data[data["ISO3"] == country]
    plt.fill_between(subset["year"], subset["cum_share"],
                     label=label_map[country],
                     color=bright_colors[country],
                     alpha=0.85)

# 设置标题、标签和图例
plt.xlabel("Year")
plt.ylabel("Share of Global Exports (%)")
plt.title("Share of Global Exports by Country", pad=12)
plt.ylim(0, 100)
plt.xlim(start_year, end_year)

# 图例设置:右上角、白色背景、透明框
plt.legend(title="Country", loc="upper right", frameon=True, framealpha=0.9, facecolor="white")
plt.tight_layout()
plt.show()

10.1.2 动态图形

import pandas as pd
import matplotlib.pyplot as plt
import matplotlib as mpl
import matplotlib.animation as animation

# 下载数据
# url = "https://www.globalmacrodata.com/GMD.csv"
# data = pd.read_csv(url)

# 读取数据
data = pd.read_csv("data/GMD.csv")

# 参数设置
start_year = 1900
end_year = 2024
year_interval = 2  # 动画间隔
years = list(range(start_year, end_year + 1, year_interval))
cname = ["USA", "DEU", "FRA", "GBR", "JPN", "CHN"]

# 数据处理
# 将出口数据转换为以美元计价
data["exports_USD"] = data["exports"] / data["USDfx"]

# 删除数据质量较差的国家
data = data[~data["ISO3"].isin(["MMR", "SLE", "ROU", "ZWE", "POL", "YUG"])]

# 保留必要的变量并删除缺失值
data = data[["ISO3", "year", "exports_USD"]].dropna()

# 筛选出指定年份范围内的数据
data = data[(data["year"] >= start_year) & (data["year"] <= end_year)]

# 计算每年全球总出口额
data["total_exports"] = data.groupby("year")["exports_USD"].transform("sum")

# 计算每个国家的出口占比
data["export_share"] = data["exports_USD"] / data["total_exports"] * 100

# 保留目标国家和其他国家的数据
data = data[data["ISO3"].isin(cname) | (data["ISO3"] != "")]

# 计算目标国家的出口占比总和
data["selected_sum"] = data.groupby("year")["export_share"].transform(
    lambda x: x if data["ISO3"].isin(cname).any() else 0
)

# 计算其他国家(ROW)的出口占比总和
data["ROW_sum"] = data.groupby("year")["export_share"].transform(
    lambda x: x if ~data["ISO3"].isin(cname).any() else 0
)

# 将非目标国家标记为 "ROW" 并更新其出口占比
data.loc[~data["ISO3"].isin(cname), "ISO3"] = "ROW"
data.loc[data["ISO3"] == "ROW", "export_share"] = data["ROW_sum"]

# 删除重复值,确保每个国家每年只有一条记录
data = data.drop_duplicates(subset=["ISO3", "year"])

# 设置样式
mpl.rcParams.update({
    "font.size": 11,
    "axes.titlesize": 15,
    "axes.labelsize": 12,
    "figure.dpi": 100,
    "figure.facecolor": "white",
    "axes.grid": True,
    "grid.linestyle": "--",
    "grid.alpha": 0.5
})

# 颜色与国家名映射
bright_colors = {
    "ROW": "#33FF00",
    "DEU": "#0072B2",
    "CHN": "#FF0000",
    "JPN": "#CC79A7",
    "GBR": "#E69F00",
    "FRA": "#56B4E9",
    "USA": "#000000",
}
label_map = {
    "ROW": "Others",
    "USA": "USA",
    "DEU": "Germany",
    "FRA": "France",
    "GBR": "UK",
    "JPN": "Japan",
    "CHN": "China"
}

# 创建动画
fig, ax = plt.subplots(figsize=(6, 6))

def update(year):
    ax.clear()
    year_data = data[data["year"] == year].copy()
    year_data = year_data.groupby("ISO3")["export_share"].sum().reset_index()
    year_data = year_data.sort_values("export_share", ascending=True)
    
    bars = ax.barh(
        [label_map[c] for c in year_data["ISO3"]],
        year_data["export_share"],
        color=[bright_colors.get(c, "#999999") for c in year_data["ISO3"]],
        alpha=0.85
    )
    
    for bar, val in zip(bars, year_data["export_share"]):
        ax.text(val + 0.5, bar.get_y() + bar.get_height()/2,
                f"{val:.1f}%", va='center', fontsize=10)

    ax.set_xlim(0, 60)
    ax.set_title(f"Global Export Share by Country - {year}")
    ax.set_xlabel("Export Share (%)")
    ax.set_ylabel("Country")
    ax.grid(True, axis="x", linestyle="--", alpha=0.5)

ani = animation.FuncAnimation(fig, update, frames=years, interval=600, repeat=False)

# 保存动画(也可改为 .mp4 需要 ffmpeg)
ani.save("figs/export_share_animation.gif", writer="pillow", dpi=120)

# 显示 GIF 动图
from IPython.display import Image
Image(filename="figs/export_share_animation.gif")
<IPython.core.display.Image object>