PCA.py 7.1 KB
Newer Older
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228
from __future__ import print_function
from scipy.linalg import svd
from numpy.ma import getmaskarray,masked_where,compress_rows,dot,diag
from numpy.ma import zeros as mzeros
from numpy.ma import array as marray
from numpy import any,nonzero,array,rank,sqrt,eye,ones,arange
from Map2D import Map2D

def standard(d):
    """
    Standardise 2D data along the last dimension.

    Args:
        d(numpy.array float): input data
    """
    d=d.transpose()
    d=(d-d.mean(0))/d.std(0)
    return d.transpose()

def samplemean(d):
    """
    Take out the mean of 2D data along the last dimension.

    Args:
        d(numpy.array float): input data
    """
    d=d.transpose()
    d=d-d.mean(0)
    return d.transpose()

def Varimax(x,norm=None,tol=1.e-10,nit=1000):
    """
    Computex varimax rotation of x.
    Args:
        x (numpy.array float): input data
        norm (optional): normalising function to use on x
        tol (float): error tolerance for iteration exit
        nit (integer): maximum number of iterations

    Returns:
        numpy.arry float): rotation matrix xrot=dot(x,r)
    """
    if norm!=None:
        x=norm(x)
    p,nc=x.shape
    print("\tcomputing varimax rotation for ",nc," components...")
    TT=eye(nc)
    d=-1
    for i in arange(nit):
        z=dot(x,TT)
        B=dot(x.transpose(),
                ( z**3-dot( z,diag(dot(ones([p]),z**2).squeeze()) )/p ))
        u,s,v=svd(B)
        TT=dot(u,v.transpose())
        d2=d
        d=diag(s).sum()
        if d2<d<d2*(1.+tol): break
    print("\t\t...used ",i,"iterations to tolerance level ",d/d2-1)
    return TT

class pca:
    """
    Class for performing princial component analysis.

    Attributes:

       data: input data normalised
       EV: eigen-vectors of data covariance (loadings of the PCs),
           Ut of svd decomposition
       scores: scores of the PCs = EV*data, diag(s)*V of
           svd-decomposition
       Sing: singular values of data SVD decomposition
       Var: variance in the respective component
       VarFrac: fractional variance in the respective component
       squaredDeviations: score squared relative to total variance,
           contribution to variance per sample and component
       m2d: Map2D map to shape and reshape arrays from input data
           shape to 2D structure.
    """


    def __init__(self,data,no=-1,sampledims=-1,norm=standard,reductDim=0,
            rotate=0):
        """
        Computes no-dimensional PCA from input data.
        Input data will be reshaped in 2D with a dimension containing
        the PCA variable space and the PCA sample space.
        Args:
            data (numpy.array float): input data
            sampleDims(integer or list of integers, optional): dimensions of input data that are collapsed to the sample dimension. Defaults to -1.
            norm: normalising function. Defaults to standard.
            reductDim (integer, optional): dimension along which is compressed using masked values, has to be homogenesous in the other dimension. 0 = variable dim, 1 = sampling dim. Defaults to 0.
        """

        # Set up mapping:
        self.m2d=Map2D(data,sampledims,axis=1)
        #reshaping to 2D:
        data=self.m2d(data)
        # mapping in 2D shape:
        if norm !=None: data=norm(data)
        self.data=self._remap(data)
        expand=False
        if any(getmaskarray(data)):
          print("\tcompressing input data...")
          print("\t\tFull size of data:",data.shape)
          #filter out variables that contain masked elements:
          # (e.g. land points):
          print("\t\t\tmasking data...")
          if reductDim==1:
              expand=True
              data=data.transpose()
          Mask=getmaskarray(data).max(1)
          if reductDim==0:
              expand=True # set flag to expand results
          idx1D=nonzero(-Mask)
          print("\t\t\tcompressing...")
          #data=compress_rows(data) # DAMN SLOW,substituted!!!
          clist=[]
          for n,d in enumerate(data):
             Mask[n] or clist.append(d)
          data=marray(clist)
          if reductDim==1: data=data.transpose()
          print("\t\tCompressed size of data:",data.shape)
        #compute svd:
        print("\tcomputing SVD...")
        u,s,v=svd(data,full_matrices=False)
        #scores: (=eigenvector*data)
        if no==-1: no=u.shape[1]
        sv=dot(diag(s[:no]),v[:no,:])
        if rotate>0:
            r=Varimax(u[:,:rotate])
        if expand and reductDim==1:
            print("\texpanding scores...")
            scores=mzeros([no,self.m2d.columns])
            for n,v in enumerate(scores):
                v[idx1D]=sv[n,:]
                scores[n,:]=masked_where(Mask,v.data)
        else:
            scores=sv
        self.scores=self._remapScores(scores)
        if expand and reductDim==0:
            print("\texpanding loadings...")
            EV=mzeros([no,self.m2d.rows])
            u=u[:,:no].transpose()
            for n,pc in enumerate(EV):
                pc[idx1D]=u[n,:]
                EV[n,:]=masked_where(Mask,pc.data)
        else:
            EV=u[:,:no].transpose()
        self.EV=self._remapEV(EV)
        self.Sing=s[:no]
        self.Var=s[:no]**2 #variances of components (eigenvalues)
        self.VarFrac=s[:no]**2/(s**2).sum() #fractions of total variance
        #self.rot*X rotates X on varimax dirs:
        if rotate: self.rot=r.transpose()


    def __call__(self,n):
        """
        Computes scores for nth prinipal component.
        Args:
            n (integer): number of component to compute.
        Returns:
            tuple of float numpy.arrays: eigenvector and score of nth component.
        """
        return self.EV[n,:],self.scores[n,:]


    def squaredDeviations(self):
        """score squared relative to total variance (contribution to
            variance per sample and component)"""
        sD=self.scores()**2/self.V.sum()
        return sD.reshape([self.EV.shape[0]]+list(self.sampleShape))


    def _map(self,data):
        """
        Maps data into 2D shape.

        Args:
            data (numpy.array float): input data.
        Returns:
            numpy.array float: data mapped onto 2D.shape.
        """

        data=self.m2d(data)
        print("Mapped data to 2D shape:",data.shape)
        return data


    def _remap(self,data):
        """
        Remaps data into full shape.

        Args:
            data (numpy.array float): 2D data.
        Returns:
            numpy.array float: data mapped onto full shape.
        """

        data=self.m2d.inv(data)
        print("Remapped data to full shape:",data.shape)
        return data
    def _remapEV(self,EV):
        """
        Remaps Eigenvalues into full shape.

        Args:
            EV (numpy.array float): 2D data.
        Returns:
            numpy.array float: Eigenvalues mapped onto full shape.
        """

        EV=self.m2d.Xinv(EV)
        return EV


    def _remapScores(self,scores):
        """
        Remaps scores into full shape.

        Args:
            scores (numpy.array float): 2D data.
        Returns:
            numpy.array float: scores mapped onto full shape.
        """
        scores=self.m2d.Yinv(scores)
        return scores