其他分享
首页 > 其他分享 > 数据挖掘—主成分分析法降维和最小最大规范化

数据挖掘—主成分分析法降维和最小最大规范化

作者:互联网

PCA


/**
 * Principal component analysis (PCA) used for dimensionality reduction.
 *
 * <p>Algorithm steps as implemented here:
 * <ol>
 *   <li>Arrange the raw data as an n-by-m matrix X (one sample per row, one feature per column).</li>
 *   <li>Center the features: subtract each column's mean so that every column has mean 0.</li>
 *   <li>Build the scatter matrix X<sup>T</sup>X of the centered data (the covariance matrix up to
 *       a constant factor, so it has the same eigenvectors).</li>
 *   <li>Compute the eigenvalues and eigenvectors of that matrix with the Jacobi method.</li>
 *   <li>Order the eigenvectors by descending eigenvalue and keep the first k as the columns of P.</li>
 *   <li>Y = X P is the data reduced to k dimensions.</li>
 * </ol>
 */
public class PCA {

    /** Number of numeric feature columns per input line; the column after them is the label. */
    private static final int FEATURE_COUNT = 13;

    /** Labels (last column of each input line) collected by the most recent {@link #readData()} call. */
    static ArrayList<String> tempc = new ArrayList<>();

    /**
     * Projects {@code src} onto its first {@code k} principal components.
     *
     * @param src data matrix with one sample per row and one feature per column; not modified
     * @param k   number of components to keep, between 0 and {@code src.numCols}
     * @return a {@code src.numRows x k} matrix holding the centered data projected on the k
     *         eigenvectors with the largest eigenvalues
     * @throws IllegalArgumentException if {@code k} is negative or larger than {@code src.numCols}
     */
    public static DenseMatrix64F runPCA(DenseMatrix64F src, int k) {
        if (k < 0 || k > src.numCols) {
            throw new IllegalArgumentException(
                    "k must be between 0 and " + src.numCols + " but was " + k);
        }

        DenseMatrix64F rs = new DenseMatrix64F(src.numRows, k);

        // Center every column around its mean.
        DenseMatrix64F centered = new DenseMatrix64F(src.numRows, src.numCols);
        for (int col = 0; col < src.numCols; col++) {
            double mean = 0;
            for (int row = 0; row < src.numRows; row++) {
                mean += src.get(row, col);
            }
            mean /= src.numRows;
            for (int row = 0; row < src.numRows; row++) {
                centered.set(row, col, src.get(row, col) - mean);
            }
        }

        // Scatter matrix = centered^T * centered.
        DenseMatrix64F centeredT = new DenseMatrix64F(src.numCols, src.numRows);
        CommonOps.transpose(centered, centeredT);
        DenseMatrix64F scatter = new DenseMatrix64F(src.numCols, src.numCols);
        CommonOps.mult(centeredT, centered, scatter);

        // Eigen decomposition; a copy is passed because JacobiCount overwrites its input.
        EDInfo ed = JacobiCount(new DenseMatrix64F(scatter), 0.001, 1000);

        // Keep the first k eigenvectors (columns of the decomposition result) as the projection.
        DenseMatrix64F projection = new DenseMatrix64F(src.numCols, k);
        for (int comp = 0; comp < k; comp++) {
            for (int feat = 0; feat < src.numCols; feat++) {
                projection.set(feat, comp, ed.getValues().get(feat, comp));
            }
        }

        CommonOps.mult(centered, projection, rs);
        return rs;
    }

    /**
     * Eigen decomposition of a symmetric matrix using classical Jacobi rotations.
     *
     * <p>Each rotation zeroes the off-diagonal entry with the largest magnitude. Iteration stops
     * once that magnitude drops below {@code diff} or after {@code iter} rotations.
     *
     * @param src  square symmetric matrix; it is overwritten during the computation
     * @param diff convergence threshold for the largest off-diagonal magnitude
     * @param iter maximum number of rotations
     * @return the eigenvectors as columns, ordered by descending eigenvalue and sign-normalised so
     *         that each column has a non-negative component sum, together with the eigenvalues in
     *         the same order
     */
    public static EDInfo JacobiCount(DenseMatrix64F src, double diff, int iter) {
        final int n = src.numRows;

        // Accumulated rotations; starts as the identity, ends with eigenvectors in its columns.
        DenseMatrix64F values = new DenseMatrix64F(src.numRows, src.numCols);
        for (int i = 0; i < src.numRows; i++) {
            for (int j = 0; j < src.numCols; j++) {
                values.set(i, j, i == j ? 1 : 0);
            }
        }

        int rotations = 0;
        // A matrix smaller than 2x2 has no off-diagonal entries, so there is nothing to rotate.
        while (n > 1) {
            // Locate the off-diagonal entry with the largest magnitude.
            double maxOffDiag = 0.0;
            int p = 0;
            int q = 1;
            for (int i = 0; i < n; i++) {
                for (int j = 0; j < n; j++) {
                    if (i != j) {
                        double magnitude = Math.abs(src.get(i, j));
                        if (magnitude > maxOffDiag) {
                            maxOffDiag = magnitude;
                            p = i;
                            q = j;
                        }
                    }
                }
            }

            if (maxOffDiag < diff || rotations >= iter) {
                break;
            }
            rotations++;

            double app = src.get(p, p);
            double apq = src.get(p, q);
            double aqq = src.get(q, q);

            // Rotation angle chosen so that the (p, q) entry becomes zero.
            double angle = 0.5 * Math.atan2(-2 * apq, aqq - app);
            double sin = Math.sin(angle);
            double cos = Math.cos(angle);
            double sin2 = Math.sin(2 * angle);
            double cos2 = Math.cos(2 * angle);

            src.set(p, p, app * cos * cos + aqq * sin * sin + 2 * apq * cos * sin);
            src.set(q, q, app * sin * sin + aqq * cos * cos - 2 * apq * cos * sin);
            src.set(p, q, 0.5 * (aqq - app) * sin2 + apq * cos2);
            src.set(q, p, src.get(p, q));

            // Update columns p and q for the remaining rows.
            for (int i = 0; i < n; i++) {
                if (i != q && i != p) {
                    double old = src.get(i, p);
                    src.set(i, p, src.get(i, q) * sin + old * cos);
                    src.set(i, q, src.get(i, q) * cos - old * sin);
                }
            }

            // Update rows p and q for the remaining columns.
            for (int j = 0; j < n; j++) {
                if (j != q && j != p) {
                    double old = src.get(p, j);
                    src.set(p, j, src.get(q, j) * sin + old * cos);
                    src.set(q, j, src.get(q, j) * cos - old * sin);
                }
            }

            // Apply the same rotation to the eigenvector accumulator.
            for (int i = 0; i < n; i++) {
                double old = values.get(i, p);
                values.set(i, p, values.get(i, q) * sin + old * cos);
                values.set(i, q, values.get(i, q) * cos - old * sin);
            }
        }

        // Eigenvalues are the diagonal of the (now nearly diagonal) matrix.
        double[] eig = new double[n];
        for (int i = 0; i < n; i++) {
            eig[i] = src.get(i, i);
        }

        int[] order = argsort(eig);

        // Reorder the eigenvectors by descending eigenvalue, keeping them as COLUMNS so that the
        // sign normalisation below and runPCA both see column i as the i-th eigenvector.
        DenseMatrix64F sortedVectors = new DenseMatrix64F(src.numRows, src.numCols);
        for (int i = 0; i < n; i++) {
            for (int j = 0; j < n; j++) {
                sortedVectors.set(j, i, values.get(j, order[i]));
            }
            eig[i] = src.get(order[i], order[i]);
        }

        // Fix the sign of every eigenvector so that its components sum to a non-negative value.
        for (int i = 0; i < n; i++) {
            double componentSum = 0;
            for (int j = 0; j < n; j++) {
                componentSum += sortedVectors.get(j, i);
            }
            if (componentSum < 0) {
                for (int j = 0; j < n; j++) {
                    sortedVectors.set(j, i, sortedVectors.get(j, i) * -1);
                }
            }
        }

        return new EDInfo(sortedVectors, eig);
    }

    /**
     * Returns the indices that order {@code input} from largest to smallest.
     *
     * <p>Side effect: {@code input} itself is sorted in descending order in place.
     *
     * @param input values to rank; reordered by this call
     * @return array whose i-th entry is the original index of the i-th largest value
     */
    public static int[] argsort(double[] input) {
        int[] rs = new int[input.length];
        for (int i = 0; i < input.length; i++) {
            rs[i] = i;
        }

        // Simple exchange sort; values and their original indices are swapped together.
        for (int i = 0; i < input.length - 1; i++) {
            for (int j = i + 1; j < input.length; j++) {
                if (input[i] < input[j]) {
                    double tmpValue = input[i];
                    input[i] = input[j];
                    input[j] = tmpValue;

                    int tmpIndex = rs[i];
                    rs[i] = rs[j];
                    rs[j] = tmpIndex;
                }
            }
        }
        return rs;
    }

    /**
     * Reads the sample set from {@code src/bp/test.txt}.
     *
     * <p>Every non-blank line must hold 13 comma-separated numbers followed by a label. The numbers
     * are echoed to standard output; the labels are stored in {@link #tempc} (which is cleared
     * first) in the same order as the returned rows. Reading is best-effort: on an I/O or parse
     * error the stack trace is printed and the rows read so far are returned.
     *
     * @return one row of 13 feature values per successfully parsed line
     * @throws IOException declared for compatibility; failures are currently reported, not thrown
     */
    public double[][] readData() throws IOException {
        tempc.clear();
        ArrayList<double[]> rows = new ArrayList<>();

        File filename = new File("src/bp/test.txt");
        try (BufferedReader br = new BufferedReader(
                new InputStreamReader(new FileInputStream(filename)))) {
            String line;
            while ((line = br.readLine()) != null) {
                if (line.trim().isEmpty()) {
                    continue; // tolerate blank lines, e.g. a trailing newline at end of file
                }
                String[] temp = line.split(",");
                double[] row = new double[FEATURE_COUNT];
                for (int i = 0; i < FEATURE_COUNT; i++) {
                    row[i] = Double.parseDouble(temp[i]);
                    System.out.print(row[i] + " ");
                }
                // Read the label before storing anything so rows and labels stay aligned.
                String label = temp[FEATURE_COUNT];
                rows.add(row);
                tempc.add(label);
                System.out.println();
            }
        } catch (IOException | RuntimeException e) {
            e.printStackTrace();
        }
        return rows.toArray(new double[0][]);
    }

    /**
     * Writes the reduced data to {@code src/bp/test(low).txt}, one sample per line: the matrix
     * row as comma-separated values followed by the label stored in {@link #tempc} for that row.
     * Failures are reported by printing the stack trace.
     *
     * @param denseMatrix64F reduced data; row i is paired with {@code tempc.get(i)}
     */
    public static void writeTxt(DenseMatrix64F denseMatrix64F) {
        try {
            StringBuilder content = new StringBuilder();
            for (int i = 0; i < denseMatrix64F.numRows; i++) {
                for (int j = 0; j < denseMatrix64F.numCols; j++) {
                    content.append(denseMatrix64F.get(i, j)).append(',');
                }
                content.append(tempc.get(i)).append("\n");
            }

            File writename = new File("src/bp/test(low).txt");
            // FileWriter creates the file when missing; try-with-resources flushes and closes it.
            try (BufferedWriter out = new BufferedWriter(new FileWriter(writename))) {
                out.write(content.toString());
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Reads the sample set, reduces it to 10 dimensions and writes the result.
     *
     * @param args unused
     * @throws IOException declared by {@link #readData()}
     */
    public static void main(String[] args) throws IOException {
        PCA pca = new PCA();
        // Load the sample set.
        double[][] primaryArray = pca.readData();
        System.out.println();
        if (primaryArray.length == 0) {
            System.err.println("No data rows could be read from src/bp/test.txt");
            return;
        }
        DenseMatrix64F denseMatrix64F = runPCA(new DenseMatrix64F(primaryArray), 10);
        writeTxt(denseMatrix64F);
    }
}

最小最大规范化



/**
 * Discretises numeric feature columns of a comma-separated file into integer bins.
 *
 * <p>Each input line holds numeric feature values followed by a label in the last column.
 * {@link #readTxt(String)} parses the file and records per-column minima and maxima;
 * {@link #main(String[])} maps each value to {@code round((value - min) / ceil((max - min) / 8))}
 * and writes the result with the label appended.
 */
public class DealData {
    /** Per-column maximum seen by the most recent {@link #readTxt(String)} call. */
    static double[] max = new double[14];
    /** Per-column minimum seen by the most recent {@link #readTxt(String)} call. */
    static double[] min = new double[14];
    /** Labels (last column) in the same order as the rows returned by {@link #readTxt(String)}. */
    static ArrayList<String> list1 = new ArrayList<>();

    /**
     * Reads a comma-separated file of numeric features followed by a label column.
     *
     * <p>Empty lines are skipped. The static {@code max}, {@code min} and {@code list1} fields are
     * reset and then filled from this file. Reading is best-effort: on an I/O or parse error the
     * stack trace is printed and the rows read so far are returned.
     *
     * @param fileName path of the file to read
     * @return one list of feature values per non-empty line, label column excluded
     */
    public static List<List<Double>> readTxt(String fileName) {
        List<List<Double>> list = new ArrayList<>();

        // Infinities (not Integer.MIN_VALUE/MAX_VALUE) so values outside the int range are handled.
        Arrays.fill(max, Double.NEGATIVE_INFINITY);
        Arrays.fill(min, Double.POSITIVE_INFINITY);
        // Keep the labels in step with the rows returned by this call.
        list1.clear();

        File filename = new File(fileName);
        try (BufferedReader br = new BufferedReader(
                new InputStreamReader(new FileInputStream(filename)))) {
            String line;
            while ((line = br.readLine()) != null) {
                if (line.length() > 0) {
                    String[] temp = line.split(",");
                    int featureCount = temp.length - 1;
                    if (featureCount > max.length) {
                        // Wider file than the default buffers: grow them instead of failing.
                        max = grow(max, featureCount, Double.NEGATIVE_INFINITY);
                        min = grow(min, featureCount, Double.POSITIVE_INFINITY);
                    }
                    ArrayList<Double> features = new ArrayList<>();
                    for (int i = 0; i < featureCount; i++) {
                        double value = Double.parseDouble(temp[i]);
                        features.add(value);
                        max[i] = Math.max(value, max[i]);
                        min[i] = Math.min(value, min[i]);
                    }
                    list1.add(temp[temp.length - 1]);
                    list.add(features);
                }
            }
        } catch (Exception e) {
            e.printStackTrace();
        }

        return list;
    }

    /** Returns a copy of {@code source} extended to {@code size}, new slots set to {@code fill}. */
    private static double[] grow(double[] source, int size, double fill) {
        double[] grown = Arrays.copyOf(source, size);
        Arrays.fill(grown, source.length, size, fill);
        return grown;
    }

    /**
     * Writes {@code content} to {@code src/bp/trainBayes.txt}, replacing any existing file.
     * Failures are reported by printing the stack trace.
     *
     * @param content text to write
     */
    public static void writeTxt(String content) {
        File writename = new File("src/bp/trainBayes.txt");
        // FileWriter creates the file when missing; try-with-resources flushes and closes it.
        try (BufferedWriter out = new BufferedWriter(new FileWriter(writename))) {
            out.write(content);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Discretises {@code src/bp/train(low).txt} and writes the result via {@link #writeTxt(String)}.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        List<List<Double>> list = readTxt("src/bp/train(low).txt");
        StringBuilder stringBuilder = new StringBuilder();

        for (int i = 0; i < list.size(); i++) {
            List<Double> row = list.get(i);
            // NOTE(review): readTxt already strips the label column, so stopping at size() - 1
            // drops the last feature as well. Kept as is; confirm whether that is intended.
            for (int j = 0; j < row.size() - 1; j++) {
                // Bin width: one eighth of the column's range, rounded up to a whole number.
                double gap = Math.ceil((max[j] - min[j]) / 8);
                stringBuilder.append(Math.round((row.get(j) - min[j]) / gap)).append(',');
            }

            stringBuilder.append(list1.get(i));
            stringBuilder.append("\n");
        }
        writeTxt(stringBuilder.toString());
    }
}

标签:src,get,int,double,分析法,维和,DenseMatrix64F,数据挖掘,new
来源: https://blog.csdn.net/weixin_44560620/article/details/113432483