期货因子分析(三)
作者:互联网
目录
回顾
代码
import numpy as np
import pandas as pd
from factor_analyzer import FactorAnalyzer
import matplotlib.pyplot as plt
from typing import TYPE_CHECKING
# Load the survey data, discard the columns that are not part of the factor
# analysis, and keep only the rows that have no missing values.
df = pd.read_csv("bfi.csv")
df = df.drop(columns=['Unnamed: 0', 'gender', 'education', 'age'])
df = df.dropna()
# TYPE_CHECKING is False at runtime, so this guard keeps block A switched off.
if TYPE_CHECKING:  # A
    # Preview the first rows of the cleaned data.
    preview = df.head()
    print(preview)
# TYPE_CHECKING is False at runtime, so this guard keeps block B switched off.
if TYPE_CHECKING:  # B
    from factor_analyzer.factor_analyzer import calculate_bartlett_sphericity

    # Bartlett's test of sphericity checks whether the observed variables are
    # correlated at all. If the result is not statistically significant,
    # factor analysis should not be used.
    bartlett_result = calculate_bartlett_sphericity(df)
    chi_square_value, p_value = bartlett_result
    print(chi_square_value, p_value)  # 18170.966350869257 0.0
    # A p-value of 0 indicates that the observed correlation matrix is not an
    # identity matrix.
# TYPE_CHECKING is False at runtime, so this guard keeps block C switched off.
if TYPE_CHECKING:  # C
    from factor_analyzer.factor_analyzer import calculate_kmo

    # calculate_kmo yields two values; only the second (kmo_model) is reported.
    kmo_result = calculate_kmo(df)
    kmo_all, kmo_model = kmo_result
    print(kmo_model)  # 0.848539722194922
    # A KMO value below 0.6 is considered inadequate, i.e. it should be
    # greater than 0.6.
# TYPE_CHECKING is False at runtime, so this guard keeps block D switched off.
if TYPE_CHECKING:  # D
    # Fit an unrotated model with one factor per observed variable. The count
    # is taken from the data instead of being hard-coded, so it always agrees
    # with the x-axis of the scree plot below.
    n_variables = df.shape[1]
    fa = FactorAnalyzer(n_variables, rotation=None)
    fa.fit(df)

    # get_eigenvalues() returns two values; only the first one is plotted.
    ev, _ = fa.get_eigenvalues()

    # Scree plot: eigenvalue against factor number (1-based), drawn as
    # markers plus a connecting line.
    factor_numbers = range(1, n_variables + 1)
    plt.scatter(factor_numbers, ev)
    plt.plot(factor_numbers, ev)
    plt.title('Scree Plot')
    plt.xlabel('Factors')
    plt.ylabel('Eigenvalue')
    plt.grid()
    plt.show()
# TYPE_CHECKING is False at runtime, so this guard keeps block E switched off.
if TYPE_CHECKING:  # E
    import seaborn as sns

    fa = FactorAnalyzer(5, rotation="varimax")
    fa.fit(df)
    # Estimator repr recorded in the original write-up:
    # FactorAnalyzer(bounds=(0.005, 1), impute='median', is_corr_matrix=False,
    #                method='minres', n_factors=5, rotation='varimax',
    #                rotation_kwargs={}, use_smc=True)

    # Heatmap of the absolute loadings: one row per observed variable
    # (df.columns) and one column per factor.
    df_cm = pd.DataFrame(np.abs(fa.loadings_), index=df.columns)
    plt.figure(figsize=(14, 14))
    ax = sns.heatmap(df_cm, annot=True, cmap="BuPu")
    # Set the font size of the y-axis tick labels.
    ax.yaxis.set_tick_params(labelsize=15)
    plt.title('Factor Analysis', fontsize='xx-large')
    # The rows are the observed variables, so label the axis accordingly
    # (the previous label was left over from an unrelated example).
    plt.ylabel('Variables', fontsize='xx-large')
    # To export a high-resolution image, call
    # plt.savefig('factorAnalysis.png', dpi=500) before plt.show().
    # dpi is a savefig/figure setting; show() does not document it.
    plt.show()
# Five-factor model with varimax rotation. This fit is not behind a
# TYPE_CHECKING guard, so it runs and provides `fa` for the code below.
fa = FactorAnalyzer(n_factors=5, rotation="varimax")
fa.fit(df)
# TYPE_CHECKING is False at runtime, so this guard keeps block F switched off.
if TYPE_CHECKING:  # F
    import seaborn as sns

    # Heatmap of the absolute loadings: one row per observed variable
    # (df.columns) and one column per factor.
    df_cm = pd.DataFrame(np.abs(fa.loadings_), index=df.columns)
    plt.figure(figsize=(14, 14))
    ax = sns.heatmap(df_cm, annot=True, cmap="BuPu")
    # Set the font size of the y-axis tick labels.
    ax.yaxis.set_tick_params(labelsize=15)
    plt.title('Factor Analysis', fontsize='xx-large')
    # The rows are the observed variables, so label the axis accordingly
    # (the previous label was left over from an unrelated example).
    plt.ylabel('Variables', fontsize='xx-large')
    # To export a high-resolution image, call
    # plt.savefig('factorAnalysis.png', dpi=500) before plt.show().
    # dpi is a savefig/figure setting; show() does not document it.
    plt.show()
# G: print the factor variance summary of the fitted model.
factor_variance = fa.get_factor_variance()
print(factor_variance)  # G
分析(代码块D)
判断因子是否有效的常用标准是:特征值 > 1,或累计方差贡献率 > 75%。
按特征值 > 1 的标准,从碎石图(scree plot)可以看出因子个数 factors_num <= 6。
分析(代码块E)
Factor 6 has no high loadings for any variable and is not
easily interpretable, so it is better to keep only five factors. 所以尝试 factors_num=5。
分析(代码块G)
总结
简单来说,这 5 个因子累计解释了 42.36% 的方差。
如果您看到这篇文章有收获或者有不同的意见,欢迎点赞或者评论。
群:984328985
丁。
标签:plt,df,CHECKING,fa,因子分析,factor,期货,import 来源: https://blog.csdn.net/a5186050/article/details/112954248