[pymvpa] PyMVPA 0.5: feature wishlist

Johan Carlin jdc55 at cam.ac.uk
Fri Oct 2 11:32:52 UTC 2009


I just remembered another feature I've missed in PyMVPA - this one
might be a tad easier to implement than parallel processing. :)

There are a lot of reports that averaging your examples together
improves classification accuracy (e.g. Mitchell et al., 2004, Machine
Learning). As near as I can tell PyMVPA doesn't have built-in support
for this at the moment.
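
For concreteness, the idea is just to replace groups of single-trial
patterns with their mean before classification. In plain numpy (toy
shapes, nothing PyMVPA-specific) that's roughly:

from numpy import arange, mean

trials = arange(24.).reshape(6, 4)  # 6 trials x 4 features (made-up numbers)
pairs = trials.reshape(3, 2, 4)     # group consecutive trials into pairs
averaged = mean(pairs, axis=1)      # 3 mean patterns, one per pair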

I wrote some functions to get the job done with my dataset, but the
code is a bit idiosyncratic. Maybe a useful starting point, at least?

Johan

from numpy import *

def resampleChunks(ds, meanover=2):
    '''Returns a new dataset where the chunks have been resampled as
    averages (each new sample is the mean of meanover original samples,
    one from each chunk split). Only works if the current number of
    chunks is divisible by meanover. Cheerfully assumes that the sample
    ordering corresponds across splits (e.g. the same label order in
    every chunk).'''
    if mod(len(ds.uniquechunks), meanover):
        raise ValueError('Number of chunks must be divisible by meanover!')
    # meanover == number of splits the chunks are divided into
    chunkparts = len(ds.uniquechunks)/meanover
    ds_l = []
    inmod = min(ds.uniquechunks) # Compatibility with 0- and 1-based chunk numbering
    for i in range(meanover):
        chunks = arange(inmod+i*chunkparts,inmod+(i+1)*chunkparts)
        ds_t = ds.select(chunks=list(chunks))
        ds_l.append(ds_t.S)
    # Dummy dataset (one split's worth of chunks) to write the means into
    ds_new = ds.select(chunks=list(chunks))
    ds_new.setSamplesDType('float64')
    # Take the mean across the dataset splits
    ds_new.S[:] = mean(ds_l, axis=0)
    return ds_new

def resampleExamples(ds):
    '''Returns a new dataset where the examples within each chunk
    are resampled to a single mean example. Useful for creating a
    mean across a block when analysing raw volumes.'''
    S_new = None
    I_list = [] # Indices for building the new dataset
    for c in ds.uniquechunks:
        c_I = array(ds.C == c, dtype='int')
        for l in ds.uniquelabels:
            # Mark samples belonging to this label/chunk combination
            l_I = array(ds.L == l, dtype='int')
            I_list.append(nonzero(c_I + l_I == 2)[0][0]) # Index of the first trial
            # Mean across all trials of this label within this chunk
            S = mean(ds.S[c_I + l_I == 2], axis=0)
            if S_new is None:
                S_new = S
            else:
                S_new = vstack((S_new, S))
    # Dummy dataset (one sample per label/chunk combination) to write the means into
    ds_new = ds.selectSamples(I_list)
    ds_new.setSamplesDType('float64') # Raw datasets tend to be int
    ds_new.S[:] = S_new
    return ds_new
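
For reference, a rough sketch of how these might be called. The toy
dataset below (random samples, 8 chunks, 2 conditions) is made up, and
it assumes the 0.4-style Dataset constructor from mvpa.datasets:

from numpy import repeat, tile
from numpy.random import rand
from mvpa.datasets import Dataset

samples = rand(32, 100)                  # 32 examples x 100 features (toy data)
labels = tile([0, 1, 0, 1], 8)           # 2 conditions, 2 trials each per chunk
chunks = repeat(range(8), 4)             # 8 chunks of 4 examples
ds = Dataset(samples=samples, labels=labels, chunks=chunks)

ds_half = resampleChunks(ds, meanover=2) # averages the two halves (chunks 0-3 vs 4-7)
ds_block = resampleExamples(ds)          # one mean example per label and chunk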


