Scripting - datacube / kmeans

This script was originally written by Adam Taylor, Teresa Murta and Alex Dexter and can be used to automatically generate a mean spectrum, detect peaks, reduce the data to the peaks with signal-to-noise greater than 3, perform k-means clustering (k = 2) on the reduced data, generate mean spectra for each cluster and then save out all variables.

This script demonstrates how SpectralAnalysis can be used without the interface to perform more complex and automatable analysis routines.

spectralAnalysisPath = 'C:\path\to\SpectralAnalysis';

inputFolder = [spectralAnalysisPath filesep 'example-data' filesep 'mouse-brain']; %location of imzML files to process
outputFolder = [spectralAnalysisPath filesep 'example-data' filesep 'mouse-brain'];
filesToProcess = dir([inputFolder filesep '*.imzML']); %gets all imzML files in folder

% Set up datacube generation variables

% Preprocessing file (.sap) location
preprocessingWorkflowFile = [spectralAnalysisPath filesep 'example-data' filesep 'mouse-brain' filesep 'mouse-brain-preprocessingWorkflow.sap']; 
nzm_multiple = 3; % multiple of non zero median

% Add SpectralAnalysis to the path - this only needs to be done once per MATLAB session
disp('Setting up ');
addpath(genpath(spectralAnalysisPath));
addJARsToClassPath();

% Generate preprocessing workflow
preprocessing = PreprocessingWorkflow();
preprocessing.loadWorkflow(preprocessingWorkflowFile);

peakPicking = GradientPeakDetection();
medianPeakFilter = PeakThresholdFilterMedian(1, nzm_multiple);
peakPicking.addPeakFilter(medianPeakFilter);

%%
for i = 1:length(filesToProcess)
    disp(['Processing ' filesToProcess(i).name]);

    input_file = [filesToProcess(i).folder filesep filesToProcess(i).name];

    % Get the filename from the path
    [~, filename, ~] = fileparts(input_file);

    %% make datacubes from each dataset

    % obtain total spectrum
    disp(['Generating Total Spectrum for ' ,input_file]);
    parser = ImzMLParser(input_file);
    parser.parse;
    data = DataOnDisk(parser);

    spectrumGeneration = TotalSpectrum();
    spectrumGeneration.setPreprocessingWorkflow(preprocessing);

    totalSpectrum = spectrumGeneration.process(data);
    totalSpectrum = totalSpectrum.get(1);

    %% Peak picking
    disp('Peak picking ');
    peaks = peakPicking.process(totalSpectrum);
    
    spectralChannels_all = totalSpectrum.spectralChannels;
    spectralChannels = [peaks.centroid];

    %% Make datacube
    disp(['! Generating data cube with ' num2str(length(peaks)) ' peaks...'])

    % If peakTolerance < 0 then the detected peak width is used
    peakTolerance = -1;

    reduction = DatacubeReduction(peakTolerance);
    reduction.setPeakList(peaks);

    % Inform the user whether we are using fast methods for processing (i.e. Java methods)
    addlistener(reduction, 'FastMethods', @(src, canUseFastMethods)disp(['! Using fast Methods?   ' num2str(canUseFastMethods.bool)]));
    
    dataRepresentationList = reduction.process(data);

    % We only requested one data representation, the entire dataset so extract that from the list
    dataRepresentation = dataRepresentationList.get(1);
    % Convert class to struct so that if SpectralAnalysis changes the DataRepresentation class, the data can still be loaded in
    dataRepresentation_struct = dataRepresentation.saveobj();

    datacube = dataRepresentation.data;
    pixels = dataRepresentation.pixels;

    %% K means clustering
    disp('Performing k-means clustering on top 1000 peaks with k = 2 and cosine distance')

    [~, top1000idx] = maxk([peaks.intensity], 1000);
    datacube_small = datacube(:,top1000idx);

    [kmeans_idx, kmeans_c, ~, ~ ] = kmeans(datacube_small, 2, 'distance', 'cosine');

    %% Make mean spectrum
    disp('Saving cluster mean spectra')

    datacube_clust1 = datacube(kmeans_idx == 1,:);
    datacube_clust2 = datacube(kmeans_idx == 2,:);

    mean_intensity_clust1 = mean(datacube_clust1);
    mean_intensity_clust2 = mean(datacube_clust2);
    mean_intensity_all = mean(datacube);

    %% Save all
    disp('Saving files')

    save([outputFolder filesep filename '.mat'], '-struct', 'dataRepresentation_struct', '-v7.3')
    save([outputFolder filesep filename '.mat'], ...
        'peaks', 'spectralChannels_all', 'spectralChannels', 'kmeans_idx', 'kmeans_c', ...
        'top1000idx', 'mean_intensity_clust1', 'mean_intensity_clust2', 'mean_intensity_all',...
        '-append')

    disp([input_file ' complete']);
end