Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
import Jama.EigenVectorValue;
import Jama.EigenvalueDecomposition;
import Jama.Matrix;

import java.io.*;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.*;
import java.util.function.Function;
import java.util.stream.Collectors;
- public class EcoliTest {
- private List<List<Double>> normalizedAttributes;
- private List<Double> averages;
- private double[][] finalData;
- private List<ReducedEntry> extractedAttributes;
- private Set<ReducedEntry> trainSet;
- private Set<ReducedEntry> testSet;
- // fix this to return list of ReducedEntry
- private void getReducedEntries() {
- List<Entry> entries = parseData("data/ecoli_data.txt");
- prepareData(entries);
- double[][] covarianceMatrix = getCovarianceMatrix(normalizedAttributes, averages);
- this.finalData = getFinalData(covarianceMatrix);
- writeFinalDataToFile();
- visualizeData();
- extractAttributes();
- divideSet();
- }
- private double[][] getFinalData(double[][] covarianceMatrix) {
- EigenvalueDecomposition eigenvalueDecomposition = new EigenvalueDecomposition(new Matrix(covarianceMatrix));
- List<EigenVectorValue> eigenVectorValues = new ArrayList<>();
- double[][] eigenVectors = eigenvalueDecomposition.getV().getArray();
- double[] eigenValues = eigenvalueDecomposition.getRealEigenvalues();
- for (int j = 0; j<eigenVectors.length; j++){
- eigenVectorValues.add(new EigenVectorValue(eigenVectors, j, eigenValues[j]));
- }
- Collections.sort(eigenVectorValues, Comparator.comparing(EigenVectorValue::getEigenValue).reversed());
- eigenVectorValues = eigenVectorValues.stream().limit(3).collect(Collectors.toList());
- int m = 3;
- int n = eigenValues.length;
- double[][] featureVector = new double[m][n]; // 8 x 3
- for (int i = 0; i<m; i++){
- for (int j = 0; j<n; j++){
- featureVector[i][j] = eigenVectorValues.get(i).getEigenVector()[j];
- }
- }
- Matrix featureMatrix = new Matrix(featureVector).transpose();
- Matrix dataAdjust = getDataAdjust().transpose();
- Matrix finalData = dataAdjust.times(featureMatrix);
- return finalData.getArray();
- }
- private Matrix getDataAdjust() {
- int m = normalizedAttributes.size();
- int n = normalizedAttributes.get(0).size();
- double[][] temp = new double[m][n];
- for (int i = 0; i<m; i++){
- for (int j = 0; j<n; j++){
- temp[i][j] = normalizedAttributes.get(i).get(j);
- }
- }
- return new Matrix(temp);
- }
- private void printMatrix(double[][] matrix, int m, int n) {
- System.out.println();
- for (int i = 0; i<m; i++){
- for (int j = 0; j<n; j++){
- System.out.printf("%.2f\t\t\t\t", matrix[i][j]);
- }
- System.out.println();
- }
- }
- private List<Entry> parseData(String fileName){
- try(BufferedReader bufferedReader =
- new BufferedReader(new InputStreamReader(new FileInputStream(fileName))) ) {
- return bufferedReader.lines()
- .map(line -> {
- String[] parts = line.split("\\s+");
- return new Entry(parts[0],
- Double.parseDouble(parts[1]),
- Double.parseDouble(parts[2]),
- Double.parseDouble(parts[3]),
- Double.parseDouble(parts[4]),
- Double.parseDouble(parts[5]),
- Double.parseDouble(parts[6]),
- Double.parseDouble(parts[7]),
- parts[8]);})
- .collect(Collectors.toList());
- } catch (IOException e) {
- e.printStackTrace();
- return new ArrayList<>();
- }
- }
- private void prepareData(List<Entry> entries) {
- List<List<Double>> attributes = new ArrayList<>();
- List<Double> averagesPerVector = new ArrayList<>();
- double temporaryAverage;
- List<Double> mcgList = entries.stream()
- .map(Entry::getMcg)
- .collect(Collectors.toList());
- temporaryAverage = getAverage(mcgList);
- averagesPerVector.add(temporaryAverage);
- attributes.add(normalizeVector(mcgList, temporaryAverage));
- List<Double> gvhList = entries.stream()
- .map(Entry::getGvh)
- .collect(Collectors.toList());
- temporaryAverage = getAverage(gvhList);
- averagesPerVector.add(temporaryAverage);
- attributes.add(normalizeVector(gvhList, temporaryAverage));
- List<Double> lipList = entries.stream()
- .map(Entry::getLip)
- .collect(Collectors.toList());
- temporaryAverage = getAverage(lipList);
- averagesPerVector.add(temporaryAverage);
- attributes.add(normalizeVector(lipList, temporaryAverage));
- List<Double> chgList = entries.stream()
- .map(Entry::getChg)
- .collect(Collectors.toList());
- temporaryAverage = getAverage(chgList);
- averagesPerVector.add(temporaryAverage);
- attributes.add(normalizeVector(chgList, temporaryAverage));
- List<Double> aacList = entries.stream()
- .map(Entry::getAac)
- .collect(Collectors.toList());
- temporaryAverage = getAverage(aacList);
- averagesPerVector.add(temporaryAverage);
- attributes.add(normalizeVector(aacList, temporaryAverage));
- List<Double> a1m1List = entries.stream()
- .map(Entry::getA1m1)
- .collect(Collectors.toList());
- temporaryAverage = getAverage(a1m1List);
- averagesPerVector.add(temporaryAverage);
- attributes.add(normalizeVector(a1m1List, temporaryAverage));
- List<Double> a1m2List = entries.stream()
- .map(Entry::getA1m2)
- .collect(Collectors.toList());
- temporaryAverage = getAverage(a1m2List);
- averagesPerVector.add(temporaryAverage);
- attributes.add(normalizeVector(a1m2List, temporaryAverage));
- this.normalizedAttributes = attributes;
- this.averages = averagesPerVector;
- }
- private List<Double> normalizeVector(List<Double> vector, double average) {
- return vector
- .stream()
- .map(number -> number - average)
- .collect(Collectors.toList());
- }
- private double getAverage(List<Double> vector){
- return vector
- .stream()
- .mapToDouble(number -> (double) number)
- .average()
- .orElse(0.0);
- }
- private double[][] getCovarianceMatrix(List<List<Double>> attributes, List<Double> averages) {
- int size = attributes.size();
- double[][] covariance = new double[size][size];
- for (int i = 0; i<size; i++){
- for (int j = i; j<size; j++){
- covariance[i][j] = calculateCovariance(attributes.get(i), attributes.get(j), averages.get(i), averages.get(j));
- }
- }
- return covariance;
- }
- private double calculateCovariance(List<Double> firstAttribute, List<Double> secondAttribute,
- Double firstAttributeAverage, Double secondAttributeAverage) {
- int n = firstAttribute.size();
- double sum = 0;
- for (int i = 0; i < n; i++){
- sum += (firstAttribute.get(i) - firstAttributeAverage) * (secondAttribute.get(i) - secondAttributeAverage);
- }
- sum /= (n-1);
- return sum;
- }
- private void writeFinalDataToFile() {
- int m = finalData.length;
- int n = finalData[0].length;
- try(BufferedWriter bufferedWriter = new BufferedWriter(new FileWriter("pyScripts/ecoli_data_transformed.txt"))){
- for (int i = 0; i<m; i++){
- for (int j = 0; j<n; j++){
- bufferedWriter.write(finalData[i][j] + "");
- if (j < n-1) {
- bufferedWriter.write("\t");
- }
- }
- if (i < m-1) {
- bufferedWriter.write("\n");
- }
- }
- } catch (IOException e) {
- e.printStackTrace();
- }
- }
- private void visualizeData() {
- try {
- Runtime.getRuntime().exec("python pyScripts/visualize.py");
- } catch (IOException e) {
- e.printStackTrace();
- }
- }
- private void extractAttributes() {
- extractedAttributes = new ArrayList<>();
- for (int i = 0; i<finalData.length; i++){
- extractedAttributes.add(new ReducedEntry(finalData[i][0], finalData[i][1], finalData[i][2]));
- }
- }
- private void divideSet() {
- trainSet = new HashSet<>();
- testSet = new HashSet<>();
- Random random = new Random();
- int n = extractedAttributes.size();
- int testSetSize = (int) (n * 0.3);
- while (testSet.size() < testSetSize){
- testSet.add(extractedAttributes.get(random.nextInt(n)));
- }
- System.out.println("TEST SET: " + testSet.size());
- testSet.forEach(System.out::println);
- for (int i = 0; i<n; i++){
- ReducedEntry re = extractedAttributes.get(i);
- if (!testSet.contains(re)){
- trainSet.add(re);
- }
- }
- System.out.println("TEST SET: " + trainSet.size());
- trainSet.forEach(System.out::println);
- }
- public static void main(String[] args) {
- EcoliTest ecoliTest = new EcoliTest();
- ecoliTest.getReducedEntries();
- }
- }
Advertisement
Add Comment
Please, Sign In to add comment