function [x, e] = LeastSquareSolution(fileNameA, fileNameB, output)
%LEASTSQUARESOLUTION  10-fold cross-validated least-squares CI estimation.
%
%   [x, e] = LeastSquareSolution(fileNameA, fileNameB, output)
%
%   Inputs
%     fileNameA  Path of a numeric text/CSV file readable by LOAD; one row
%                per sample, one column per feature (the matrix A).
%     fileNameB  Path of a numeric text/CSV file readable by LOAD holding
%                one target value per sample (the vector b).
%     output     Folder that receives the per-fold files Fold_<k>.txt.
%                Optional; defaults to the current folder. It is created
%                if it does not exist.
%
%   Outputs
%     x          Least-squares solution vector of the LAST fold.
%     e          Training residual A_train*x - b_train of the LAST fold.
%
%   Files written
%     <output>/Fold_<k>.txt   predicted values, actual values and their
%                             difference for fold k, followed by the
%                             tab-delimited training and test matrices.
%     newCIs.txt              all predicted values, in sample order
%                             (written to the current folder).
%     Solution_vectors.txt    one solution vector per row, one row per
%                             fold (written to the current folder).
%
%   The samples are split into contiguous folds of floor(N/10) samples;
%   the last fold additionally takes the N - 10*floor(N/10) left-over
%   samples. When N < 10 the number of folds is reduced to N.

    if nargin < 3 || isempty(output)
        output = '.';
    end

    A = load(fileNameA);
    b = load(fileNameB);
    b = b(:);                       % always work with a column vector

    n_samples = numel(b);
    if size(A, 1) ~= n_samples
        error('LeastSquareSolution:sizeMismatch', ...
              'A has %d rows but b has %d values.', size(A, 1), n_samples);
    end
    if n_samples < 2
        error('LeastSquareSolution:tooFewSamples', ...
              'At least two samples are needed for cross-validation.');
    end

    % Fold sizes must be whole numbers: a fractional fold size would be
    % used as a subscript below and make MATLAB raise an indexing error.
    total_folds = min(10, n_samples);
    fold_size   = floor(n_samples / total_folds);
    leftover    = n_samples - total_folds * fold_size;

    if exist(output, 'dir') ~= 7
        mkdir(output);
    end

    solution = zeros(total_folds, size(A, 2));   % one solution per row
    CI       = zeros(n_samples, 1);              % predictions, sample order

    for fold = 1:total_folds
        first_test = (fold - 1) * fold_size + 1;
        last_test  = fold * fold_size;
        if fold == total_folds
            last_test = last_test + leftover;    % last fold takes the rest
        end

        % logical masks: test = current fold, train = everything else
        test = false(n_samples, 1);
        test(first_test:last_test) = true;
        train = ~test;

        A_ = A(train, :);            % training features
        b_ = b(train);               % training targets

        % least-squares fit on the training data
        x = svdLeastSquares(A_, b_);

        % residual on the training data
        e = A_ * x - b_;

        solution(fold, :) = x';

        % predict the held-out samples
        test_data = A(test, :);
        ci        = test_data * x;
        actual_ci = b(test);
        CI(test)  = ci;

        % per-fold report (format unchanged: the README one-liner that
        % concatenates the fold files relies on it)
        debug_file = fullfile(output, sprintf('Fold_%d.txt', fold));
        fid_debug = fopen(debug_file, 'w');
        if fid_debug == -1
            error('LeastSquareSolution:cannotOpen', ...
                  'Cannot open %s for writing.', debug_file);
        end
        fprintf(fid_debug, '%s\n\n', 'new CI');
        fprintf(fid_debug, '%8.6f\n', ci);
        fprintf(fid_debug, '%s\n\n', 'actual CI');
        fprintf(fid_debug, '%8.6f\n', actual_ci);
        fprintf(fid_debug, '%s\n\n', 'error');
        fprintf(fid_debug, '%8.6f\n', (ci - actual_ci));
        fclose(fid_debug);

        dlmwrite(debug_file, A_, 'delimiter', '\t', '-append');
        dlmwrite(debug_file, test_data, 'delimiter', '\t', '-append');
    end

    % write out the predicted ci values
    fid_ci = fopen('newCIs.txt', 'w');
    if fid_ci == -1
        error('LeastSquareSolution:cannotOpen', ...
              'Cannot open newCIs.txt for writing.');
    end
    fprintf(fid_ci, '%8.6f\n', CI);
    fclose(fid_ci);

    dlmwrite('Solution_vectors.txt', solution, 'delimiter', '\t');


function x = svdLeastSquares(M, rhs)
% Minimum-norm least-squares solution of M*x = rhs via the economy SVD.
% Singular values at or below the PINV-style tolerance are skipped, so a
% rank-deficient or under-determined M does not cause a division by zero.
    [U, S, V] = svd(M, 'econ');
    s = diag(S);
    if isempty(s)
        x = zeros(size(M, 2), 1);
        return;
    end
    tol  = max(size(M)) * eps(max(s));
    keep = s > tol;
    c = U(:, keep)' * rhs;
    x = V(:, keep) * (c ./ s(keep));


% -------------------------------------------------------------------------
% Provenance
%   Commit e8ad4e96614fea1d6605b7bbd5c2d1a332f5daa2, mutantturkey,
%   Wed, 19 Sep 2012 09:54:39 -0400:
%   "added README, added ability to specify output folder, and use 10 folds
%   default, and calculate the fold size instead of the inverse"
%   Files: fly-tools/cci-calculator/LeastSquareSolution.m,
%          fly-tools/cci-calculator/README.markdown
%
% -------------------------------------------------------------------------
% README (fly-tools/cci-calculator/README.markdown)
%
% == This is the least-squares solution CCI calculator ==
%
% You'll need to run this in MATLAB and have prepared your CSVs in advance.
% You should have two CSVs. The first CSV should contain all of your
% concatenated feature vectors: one feature per column, and as many rows as
% you have specimens. The second CSV should contain the actual CI data
% recorded and calculated by the biologists.
%
% In MATLAB the command may look like this (note that you need to be in the
% directory that contains LeastSquareSolution.m to call it):
%
%     [x, e] = LeastSquareSolution('../input/ALL_FVs.csv', 'ALL_CIs.csv', '../output/');
%
% Alternatively, you can run it straight from the shell, without first
% starting an interactive MATLAB session:
%
%     matlab -r "LeastSquareSolution ALL_FV.csv ALL_CIs.csv ../output"
%
% The output will be written to the output folder you specified, in text
% files named Fold_1.txt, Fold_2.txt, etc. You will find it somewhat
% annoying to concatenate all that data manually, so here is a one-liner
% that does it for you. You need to set the numbers according to the number
% of specimens in each fold.
%
% If we have 21 specimens per fold, use the number of specimens per fold + 2
% for the head argument (the two extra lines are the "new CI" heading and
% the blank line after it), and the number of specimens per fold for the
% tail argument. This will give you proper results! Note that the last fold
% also holds the left-over specimens when the total is not a multiple of
% the number of folds, so it may need larger numbers.
%
%     for i in Fold_*.txt; do head $i -n 23 | tail -n 21 >> output.txt; echo $i; done;