数据挖掘—主成分分析法降维和最小最大规范化
作者:互联网
- 算法步骤:
- 1)将原始数据按列组成n行m列矩阵X
- 2)特征中心化。即每一维的数据都减去该维的均值,使每一维的均值都为0
- 3)求出协方差矩阵
- 4)求出协方差矩阵的特征值及对应的特征向量
- 5)将特征向量按对应的特征值大小从上往下按行排列成矩阵,取前k行组成矩阵P
- 6)Y=PX 即为降维到k维后的数据
PCA
/**
 * Principal component analysis (PCA) used to reduce the dimensionality of a data set.
 *
 * <p>Steps performed by {@link #runPCA(DenseMatrix64F, int)}:
 * <ol>
 *   <li>Treat the input as one sample per row and one feature per column.</li>
 *   <li>Centre every feature by subtracting its column mean.</li>
 *   <li>Build the scatter matrix X^T * X of the centred data.</li>
 *   <li>Compute the eigenvalues and eigenvectors of that matrix (Jacobi rotations).</li>
 *   <li>Order the eigenvectors by descending eigenvalue and keep the first k.</li>
 *   <li>Project the centred data onto those k eigenvectors.</li>
 * </ol>
 */
public class PCA {
    /** Number of numeric feature columns expected in every input line; the label follows them. */
    private static final int FEATURE_COUNT = 13;
    /** Comma-separated input file read by {@link #readData()}. */
    private static final String INPUT_PATH = "src/bp/test.txt";
    /** Output file written by {@link #writeTxt(DenseMatrix64F)}. */
    private static final String OUTPUT_PATH = "src/bp/test(low).txt";

    /** Labels (field 13 of each input line) in the order the rows were read by {@link #readData()}. */
    static ArrayList<String> tempc=new ArrayList<>();

    /**
     * Projects {@code src} onto its first {@code k} principal components.
     *
     * @param src input matrix, one sample per row and one feature per column; not modified
     * @param k   number of components to keep, between 0 and {@code src.numCols}
     * @return a {@code src.numRows x k} matrix holding the projected, centred samples
     * @throws IllegalArgumentException if {@code k} is negative or larger than {@code src.numCols}
     */
    public static DenseMatrix64F runPCA(DenseMatrix64F src,int k) {
        if (k < 0 || k > src.numCols) {
            throw new IllegalArgumentException(
                    "k must be between 0 and " + src.numCols + " but was " + k);
        }
        final int rows = src.numRows;
        final int cols = src.numCols;

        // Centre each feature: subtract the column mean from every entry of that column.
        DenseMatrix64F centered = new DenseMatrix64F(rows, cols);
        for (int c = 0; c < cols; c++) {
            double sum = 0;
            for (int r = 0; r < rows; r++) {
                sum += src.get(r, c);
            }
            double mean = sum / rows;
            for (int r = 0; r < rows; r++) {
                centered.set(r, c, src.get(r, c) - mean);
            }
        }

        // Scatter matrix X^T * X (cols x cols). It is not divided by the sample count,
        // which scales the eigenvalues but leaves the eigenvectors unchanged.
        DenseMatrix64F centeredT = new DenseMatrix64F(cols, rows);
        CommonOps.transpose(centered, centeredT);
        DenseMatrix64F scatter = new DenseMatrix64F(cols, cols);
        CommonOps.mult(centeredT, centered, scatter);

        // JacobiCount diagonalises its argument in place, so hand it a copy.
        EDInfo ed = JacobiCount(new DenseMatrix64F(scatter), 0.001, 1000);

        // Copy the first k eigenvectors (columns of the decomposition result).
        DenseMatrix64F projection = new DenseMatrix64F(cols, k);
        for (int i = 0; i < k; i++) {
            for (int j = 0; j < cols; j++) {
                projection.set(j, i, ed.getValues().get(j, i));
            }
        }

        DenseMatrix64F reduced = new DenseMatrix64F(rows, k);
        CommonOps.mult(centered, projection, reduced);
        return reduced;
    }

    /**
     * Eigen-decomposition by Jacobi rotations.
     *
     * <p>{@code src} is overwritten: on return its diagonal holds the (unsorted) eigenvalues.
     * The rotation formulas assume {@code src} is square and symmetric.
     *
     * @param src  matrix to decompose; modified in place
     * @param diff stop once the largest absolute off-diagonal entry is below this value
     * @param iter iteration limit; the loop stops once more than {@code iter} rotations were applied
     * @return an {@code EDInfo} built from the eigenvector matrix (column {@code i} is the
     *         eigenvector of the i-th largest eigenvalue, sign-normalised so that its
     *         components do not sum to a negative number) and the eigenvalues in descending order
     */
    public static EDInfo JacobiCount(DenseMatrix64F src, double diff, int iter) {
        // Accumulated rotations, initialised to the identity matrix.
        DenseMatrix64F values = new DenseMatrix64F(src.numRows,src.numCols);
        for (int i = 0; i < src.numRows; i++) {
            for (int j = 0; j < src.numCols; j++) {
                values.set(i, j, i == j ? 1 : 0);
            }
        }

        int nCount = 0;
        while (true) {
            // Locate the off-diagonal entry with the largest magnitude.
            double dbMax = Double.MIN_VALUE;
            int nRow = 0;
            int nCol = 1;
            for (int i = 0; i < src.numRows; i++) {
                for (int j = 0; j < src.numCols; j++) {
                    if (i != j && Math.abs(src.get(i, j)) > dbMax) {
                        dbMax = Math.abs(src.get(i, j));
                        nRow = i;
                        nCol = j;
                    }
                }
            }
            if (dbMax < diff) {
                break;
            }
            if (nCount > iter) {
                break;
            }
            nCount++;

            double dbApp = src.get(nRow, nRow);
            double dbApq = src.get(nRow, nCol);
            double dbAqq = src.get(nCol, nCol);

            // Rotation angle chosen so that the (nRow, nCol) entry becomes zero.
            double dbAngle = 0.5 * Math.atan2(-2 * dbApq, dbAqq - dbApp);
            double dbSinTheta = Math.sin(dbAngle);
            double dbCosTheta = Math.cos(dbAngle);
            double dbSin2Theta = Math.sin(2 * dbAngle);
            double dbCos2Theta = Math.cos(2 * dbAngle);

            src.set(nRow, nRow, dbApp * dbCosTheta * dbCosTheta
                    + dbAqq * dbSinTheta * dbSinTheta + 2 * dbApq * dbCosTheta * dbSinTheta);
            src.set(nCol, nCol, dbApp * dbSinTheta * dbSinTheta
                    + dbAqq * dbCosTheta * dbCosTheta - 2 * dbApq * dbCosTheta * dbSinTheta);
            src.set(nRow, nCol, 0.5 * (dbAqq - dbApp) * dbSin2Theta + dbApq * dbCos2Theta);
            src.set(nCol, nRow, src.get(nRow, nCol));

            // Rotate columns nRow and nCol of the remaining rows.
            for (int i = 0; i < src.numRows; i++) {
                if ((i != nCol) && (i != nRow)) {
                    double previous = src.get(i, nRow);
                    src.set(i, nRow, src.get(i, nCol) * dbSinTheta + previous * dbCosTheta);
                    src.set(i, nCol, src.get(i, nCol) * dbCosTheta - previous * dbSinTheta);
                }
            }
            // Rotate rows nRow and nCol of the remaining columns.
            for (int j = 0; j < src.numRows; j++) {
                if ((j != nCol) && (j != nRow)) {
                    double previous = src.get(nRow, j);
                    src.set(nRow, j, src.get(nCol, j) * dbSinTheta + previous * dbCosTheta);
                    src.set(nCol, j, src.get(nCol, j) * dbCosTheta - previous * dbSinTheta);
                }
            }
            // Apply the same column rotation to the accumulated eigenvector matrix.
            for (int i = 0; i < src.numRows; i++) {
                double previous = values.get(i, nRow);
                values.set(i, nRow, values.get(i, nCol) * dbSinTheta + previous * dbCosTheta);
                values.set(i, nCol, values.get(i, nCol) * dbCosTheta - previous * dbSinTheta);
            }
        }

        double[] eig = new double[src.numRows];
        for (int i = 0; i < src.numRows; i++) {
            eig[i] = src.get(i, i);
        }
        int[] sortInx = argsort(eig);

        // Reorder the eigenvector columns by descending eigenvalue. The vectors stay in
        // columns so that the sign normalisation below and runPCA, which both read
        // column i as the i-th eigenvector, see whole eigenvectors.
        DenseMatrix64F tmpValues = new DenseMatrix64F(src.numRows,src.numCols);
        for (int i = 0; i < src.numRows; i++) {
            for (int j = 0; j < src.numRows; j++) {
                tmpValues.set(j, i, values.get(j, sortInx[i]));
            }
            eig[i] = src.get(sortInx[i], sortInx[i]);
        }

        // Fix the sign of each eigenvector: flip it when its components sum to a negative value.
        for (int i = 0; i < src.numRows; i++) {
            double dSumVec = 0;
            for (int j = 0; j < src.numRows; j++) {
                dSumVec += tmpValues.get(j, i);
            }
            if (dSumVec < 0) {
                for (int j = 0; j < src.numRows; j++) {
                    tmpValues.set(j, i, tmpValues.get(j, i) * -1);
                }
            }
        }
        return new EDInfo(tmpValues,eig);
    }

    /**
     * Returns the indices that order {@code input} from largest to smallest.
     *
     * <p>Side effect: {@code input} itself is sorted into descending order by the same swaps.
     *
     * @param input values to rank; reordered in place
     * @return array whose i-th entry is the original index of the i-th largest value
     */
    public static int[] argsort(double[] input) {
        int[] rs = new int[input.length];
        for (int i = 0; i < input.length; i++) {
            rs[i] = i;
        }
        for (int i = 0; i < input.length - 1; i++) {
            for (int j = i + 1; j < input.length; j++) {
                if (input[i] < input[j]) {
                    double tmp = input[i];
                    input[i] = input[j];
                    input[j] = tmp;

                    int tmpIndex = rs[j];
                    rs[j] = rs[i];
                    rs[i] = tmpIndex;
                }
            }
        }
        return rs;
    }

    /**
     * Reads the comma-separated sample file {@code src/bp/test.txt}.
     *
     * <p>Each non-blank line must contain at least 14 fields: 13 numeric features followed by
     * a label. The features are echoed to standard output and returned; the labels replace
     * the previous contents of {@link #tempc}.
     *
     * @return one row per non-blank input line, each with 13 feature values
     * @throws IOException if the file cannot be read or a line is malformed
     */
    public double[][] readData() throws IOException {
        // Start from a clean slate so labels always line up with the rows returned by this call.
        tempc.clear();
        ArrayList<double[]> rows = new ArrayList<>();
        try (BufferedReader br = new BufferedReader(
                new InputStreamReader(new FileInputStream(new File(INPUT_PATH))))) {
            int lineNumber = 0;
            String line;
            while ((line = br.readLine()) != null) {
                lineNumber++;
                if (line.trim().isEmpty()) {
                    continue; // tolerate blank lines, e.g. a trailing newline at end of file
                }
                String[] temp = line.split(",");
                if (temp.length <= FEATURE_COUNT) {
                    throw new IOException("Line " + lineNumber + " of " + INPUT_PATH
                            + " has " + temp.length + " fields, expected at least "
                            + (FEATURE_COUNT + 1));
                }
                double[] row = new double[FEATURE_COUNT];
                try {
                    for (int i = 0; i < FEATURE_COUNT; i++) {
                        row[i] = Double.parseDouble(temp[i]);
                    }
                } catch (NumberFormatException e) {
                    throw new IOException("Line " + lineNumber + " of " + INPUT_PATH
                            + " contains a non-numeric feature", e);
                }
                for (int i = 0; i < FEATURE_COUNT; i++) {
                    System.out.print( row[i]+" ");
                }
                System.out.println();
                rows.add(row);
                tempc.add(temp[FEATURE_COUNT]);
            }
        }
        return rows.toArray(new double[rows.size()][]);
    }

    /**
     * Writes the matrix to {@code src/bp/test(low).txt}, one row per line, values separated
     * by commas and followed by the matching label from {@link #tempc}.
     *
     * <p>Failures are reported on standard error and otherwise ignored.
     *
     * @param denseMatrix64F matrix to write; row {@code i} is paired with {@code tempc.get(i)}
     */
    public static void writeTxt( DenseMatrix64F denseMatrix64F){
        try {
            // Build the full content first so a failure here does not truncate an existing file.
            StringBuilder stringBuilder=new StringBuilder();
            for (int i = 0; i < denseMatrix64F.numRows; i++) {
                for (int j = 0; j < denseMatrix64F.numCols; j++) {
                    stringBuilder.append(denseMatrix64F.get(i,j)).append(',');
                }
                stringBuilder.append(tempc.get(i)).append("\n");
            }
            // FileWriter creates the file when it does not exist yet.
            try (BufferedWriter out = new BufferedWriter(new FileWriter(new File(OUTPUT_PATH)))) {
                out.write(stringBuilder.toString());
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Reads the sample file, reduces it to 10 principal components and writes the result.
     *
     * @param args unused
     * @throws IOException if the sample file cannot be read
     */
    public static void main(String[] args) throws IOException {
        PCA pca = new PCA();
        // Load the sample set.
        double[][] primaryArray =pca.readData() ;
        System.out.println();
        if (primaryArray.length == 0) {
            System.err.println("No data rows found in " + INPUT_PATH);
            return;
        }
        DenseMatrix64F denseMatrix64F=runPCA(new DenseMatrix64F(primaryArray),10);
        writeTxt(denseMatrix64F);
    }
}
最小最大规范化(基于每列最小值/最大值的等宽离散化)
/**
 * Discretises a comma-separated data file into equal-width bins derived from each
 * column's minimum and maximum.
 *
 * <p>Every non-empty input line holds numeric feature fields followed by one label field.
 */
public class DealData {
    /** Per-column maximum seen by the most recent {@link #readTxt(String)} call. */
    static double[] max=new double[14];
    /** Per-column minimum seen by the most recent {@link #readTxt(String)} call. */
    static double[] min=new double[14];
    /** Labels (last field of each line) read by the most recent {@link #readTxt(String)} call. */
    static ArrayList<String> list1=new ArrayList<>();

    /**
     * Reads a comma-separated file and records per-column extremes.
     *
     * <p>For each non-empty line, every field except the last is parsed as a double and the
     * last field is stored in {@link #list1}. {@link #max}, {@link #min} and {@link #list1}
     * are reset at the start of each call, and the two arrays grow when a line has more
     * feature columns than they can hold. Columns that never received a value keep
     * negative/positive infinity.
     *
     * <p>If reading or parsing fails, the stack trace is printed and the rows read so far
     * are returned.
     *
     * @param fileName path of the file to read
     * @return one list of feature values per successfully read line
     */
    public static List<List<Double>> readTxt(String fileName){
        List<List<Double>> list=new ArrayList<>();
        // Infinities are the neutral elements for max/min over doubles.
        Arrays.fill(max, Double.NEGATIVE_INFINITY);
        Arrays.fill(min, Double.POSITIVE_INFINITY);
        // Labels must line up with the rows returned by this call only.
        list1.clear();
        try (BufferedReader br = new BufferedReader(
                new InputStreamReader(new FileInputStream(new File(fileName))))) {
            String line;
            while ((line = br.readLine()) != null) {
                if (line.length() == 0) {
                    continue;
                }
                String[] temp=line.split(",");
                // Parse the whole row before touching shared state, so a malformed field
                // cannot leave max/min updated for only part of a row.
                ArrayList<Double> strings=new ArrayList<>();
                for (int i = 0; i < temp.length - 1; i++) {
                    strings.add(Double.parseDouble(temp[i]));
                }
                String label = temp[temp.length-1];

                ensureColumnCapacity(strings.size());
                for (int i = 0; i < strings.size(); i++) {
                    double value = strings.get(i);
                    max[i]=Math.max(value,max[i]);
                    min[i]=Math.min(value,min[i]);
                }
                list1.add(label);
                list.add(strings);
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
        return list;
    }

    /** Grows {@link #max} and {@link #min} so they can hold at least {@code columns} entries. */
    private static void ensureColumnCapacity(int columns) {
        if (columns <= max.length) {
            return;
        }
        int oldLength = max.length;
        max = Arrays.copyOf(max, columns);
        min = Arrays.copyOf(min, columns);
        Arrays.fill(max, oldLength, columns, Double.NEGATIVE_INFINITY);
        Arrays.fill(min, oldLength, columns, Double.POSITIVE_INFINITY);
    }

    /**
     * Writes {@code content} to {@code src/bp/trainBayes.txt}, replacing any existing file.
     *
     * <p>Failures are reported on standard error and otherwise ignored.
     *
     * @param content text to write
     */
    public static void writeTxt(String content){
        // FileWriter creates the file when it does not exist yet.
        try (BufferedWriter out = new BufferedWriter(
                new FileWriter(new File("src/bp/trainBayes.txt")))) {
            out.write(content);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Reads {@code src/bp/train(low).txt}, maps every feature value to a bin index and
     * writes the binned rows, each followed by its label, via {@link #writeTxt(String)}.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        List<List<Double>> list= readTxt("src/bp/train(low).txt");
        StringBuilder stringBuilder=new StringBuilder();
        for (int i = 0; i < list.size(); i++) {
            List<Double> row = list.get(i);
            // NOTE(review): readTxt already strips the label, so "size() - 1" leaves the
            // last feature column out of the output. Kept as-is; confirm whether intended.
            for (int j = 0; j < row.size() - 1; j++) {
                // Bin width: one eighth of the column range, rounded up to a whole number.
                double gap=Math.ceil((max[j]-min[j])/8);
                stringBuilder.append(Math.round((row.get(j)-min[j])/gap)).append(',');
            }
            stringBuilder.append(list1.get(i));
            stringBuilder.append("\n");
        }
        writeTxt(stringBuilder.toString());
    }
}
标签:src,get,int,double,分析法,维和,DenseMatrix64F,数据挖掘,new 来源: https://blog.csdn.net/weixin_44560620/article/details/113432483