aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authormutantturkey <mutantturke@gmail.com>2012-09-19 09:54:39 -0400
committermutantturkey <mutantturke@gmail.com>2012-09-19 09:54:39 -0400
commite8ad4e96614fea1d6605b7bbd5c2d1a332f5daa2 (patch)
treeef2929c463721a1301bc09f705faaab6675d24c4
parent5cc9a2db607ac074343d7f18e14c3bb187123530 (diff)
added README, added ability to specify output folder, and use 10 folds default, and calculate the fold size instead of the inverse
-rw-r--r--fly-tools/cci-calculator/LeastSquareSolution.m133
-rw-r--r--fly-tools/cci-calculator/README.markdown30
2 files changed, 163 insertions, 0 deletions
diff --git a/fly-tools/cci-calculator/LeastSquareSolution.m b/fly-tools/cci-calculator/LeastSquareSolution.m
new file mode 100644
index 0000000..275f822
--- /dev/null
+++ b/fly-tools/cci-calculator/LeastSquareSolution.m
@@ -0,0 +1,133 @@
+function [x, e] = LeastSquareSolution(fileNameA, fileNameB, output)
+%LEASTSQUARESOLUTION Cross-validated least-squares fit computed with the SVD.
+%
+%   [x, e] = LeastSquareSolution(fileNameA, fileNameB, output)
+%
+%   Inputs
+%     fileNameA  text file that load() reads as the matrix A, one row per
+%                sample.
+%     fileNameB  text file that load() reads as the vector b, one value per
+%                sample (row or column layout).
+%     output     folder that receives the per-fold reports. Optional; it
+%                defaults to the current folder and is created if missing.
+%
+%   Outputs
+%     x  least-squares solution of the LAST fold only.
+%     e  training residual A_*x - b_ of the LAST fold only.
+%
+%   The samples are split, in file order, into at most 10 contiguous folds.
+%   For each fold the remaining samples form the training system A_*x = b_,
+%   which is solved in the least-squares sense, and the held-out rows are
+%   predicted as test_data*x.
+%
+%   Files written
+%     <output>/Fold_<k>.txt  predictions, actual values, errors, training
+%                            rows and test rows of fold k.
+%     newCIs.txt             every held-out prediction, in sample order
+%                            (current working directory).
+%     Solution_vectors.txt   one solution vector per row, one row per fold
+%                            (current working directory).
+%
+%   See also SVD, PINV, DLMWRITE.
+
+    if nargin < 3 || isempty(output)
+        output = '.';
+    end
+    if ~exist(output, 'dir')
+        mkdir(output);
+    end
+
+    A = load(fileNameA);
+    b = load(fileNameB);
+    b = b(:);   % the algebra below needs b as a column vector
+
+    n_samples = numel(b);
+    if n_samples < 2
+        error('LeastSquareSolution:tooFewSamples', ...
+              'At least two samples are needed, %s holds %d.', ...
+              fileNameB, n_samples);
+    end
+    if size(A, 1) ~= n_samples
+        error('LeastSquareSolution:sizeMismatch', ...
+              'A has %d rows but b has %d values.', size(A, 1), n_samples);
+    end
+
+    % Never use more folds than samples, otherwise some folds would be empty.
+    total_folds = min(10, n_samples);
+
+    % The fold size is used to build indices, so it has to be an integer.
+    % The samples left over by the division are added to the last fold.
+    fold_size = floor(n_samples / total_folds);
+
+    CI = zeros(n_samples, 1);                   % held-out predictions
+    solution = zeros(total_folds, size(A, 2));  % one solution per row
+
+    for fold = 1:total_folds
+        first = (fold - 1) * fold_size + 1;
+        if fold == total_folds
+            last = n_samples;
+        else
+            last = fold * fold_size;
+        end
+
+        % Logical masks selecting the held-out and the training samples.
+        test = false(n_samples, 1);
+        test(first:last) = true;
+        train = ~test;
+
+        A_ = A(train, :);
+        b_ = b(train);
+        test_data = A(test, :);
+        actual_ci = b(test);
+
+        % Economy-size SVD, A_ = U*S*V'. It yields the same solution as the
+        % full decomposition without building the square U matrix.
+        [U, S, V] = svd(A_, 'econ');
+        s = diag(S);
+        c = U' * b_;
+
+        % y = c ./ s, except that directions whose singular value is
+        % numerically zero are dropped instead of producing Inf or NaN.
+        tol = max(size(A_)) * eps(max(s));
+        keep = s > tol;
+        y = zeros(size(s));
+        y(keep) = c(keep) ./ s(keep);
+
+        x = V * y;          % least-squares solution of this fold
+        e = A_ * x - b_;    % residual on the training data
+        solution(fold, :) = x';
+
+        ci = test_data * x; % predictions for the held-out samples
+        CI(test) = ci;
+
+        debug_file = fullfile(output, sprintf('Fold_%d.txt', fold));
+        fid_debug = fopen(debug_file, 'w');
+        if fid_debug < 0
+            error('LeastSquareSolution:cannotOpen', ...
+                  'Cannot open %s for writing.', debug_file);
+        end
+        fprintf(fid_debug, '%s\n\n', 'new CI');
+        fprintf(fid_debug, '%8.6f\n', ci);
+        fprintf(fid_debug, '%s\n\n', 'actual CI');
+        fprintf(fid_debug, '%8.6f\n', actual_ci);
+        fprintf(fid_debug, '%s\n\n', 'error');
+        fprintf(fid_debug, '%8.6f\n', ci - actual_ci);
+        fclose(fid_debug);
+
+        % The training and the test rows are appended after the report.
+        dlmwrite(debug_file, A_, 'delimiter', '\t', '-append');
+        dlmwrite(debug_file, test_data, 'delimiter', '\t', '-append');
+    end
+
+    % Write out every held-out prediction.
+    fid_ci = fopen('newCIs.txt', 'w');
+    if fid_ci < 0
+        error('LeastSquareSolution:cannotOpen', ...
+              'Cannot open %s for writing.', 'newCIs.txt');
+    end
+    fprintf(fid_ci, '%8.6f\n', CI);
+    fclose(fid_ci);
+
+    dlmwrite('Solution_vectors.txt', solution, 'delimiter', '\t');
+
+
diff --git a/fly-tools/cci-calculator/README.markdown b/fly-tools/cci-calculator/README.markdown
new file mode 100644
index 0000000..065563c
--- /dev/null
+++ b/fly-tools/cci-calculator/README.markdown
@@ -0,0 +1,30 @@
+== This is the least square solution CCI calculator ==
+
+You'll need to run this in MATLAB, and have prepared your CSVs in advance. You
+should have two CSVs. The first CSV should contain all of your concatenated
+feature vectors: one feature per column, and as many rows as you have
+specimens. The second CSV should include the actual CI data recorded and
+calculated by the biologists.
+
+In matlab the command may look like this: (note you need to be in the directory
+to call this)
+
+    [x, e] = LeastSquareSolution('../input/ALL_FVs.csv', 'ALL_CIs.csv', ...
+                                 '../output/');
+
+Alternatively you can run it without first invoking the compiler by running
+this:
+
+ matlab -r "LeastSquareSolution ALL_FV.csv ALL_CIs.csv ../output"
+
+
+The output is written to the folder you specified, in text files that are named
+like this: Fold_1.txt Fold_2.txt etc. You will find it somewhat annoying to have
+to concatenate all that data manually, so here is a nice one-liner that does it
+for you. You need to set its arguments according to the number of specimens in
+each fold.
+
+If we have 21 specimens per fold, use the number of specimens per fold + 2 (here 23) for the head
+argument, and the number of specimens per fold (here 21) for the tail. This will give you proper results!
+
+ for i in Fold_*.txt; do head $i -n 23 | tail -n 21 >> output.txt; echo $i; done;