The Tech Blog Reopens

Tagged: people, tech blog, 统计之都 (Capital of Statistics), statistics memos, ramblings
Jan 04, 2010
 

I have imported all the posts I wrote on Spaces into this new blog.

In this brand-new, still-dazzling year, I would like to write more about SAS programmers themselves: the profession, and the industry environment it depends on. SAS programming is not yet a thriving career in China.

There will also be posts about SAS itself: the SAS language, the SAS company, and its founders. Lately I have become quite interested in Tony Barr, one of the founders of SAS.

As for the technology itself, which is what pays the bills: besides SAS, a lot of the ink will probably go to CDISC. There will of course also be other posts on whatever catches my interest; it is only the beginning of the year, and nothing is settled yet. As agreed with 统计之都 (Capital of Statistics), anything related to statistics will be published there first and then backed up on my own blog:

http://cos.name/author/hujiangtang/

Jan 02, 2010
 
********************************************************;
* A SAS MACRO IMPORTING SQLITE DATA TABLE WITHOUT ODBC *;
* ---------------------------------------------------- *;
* REQUIREMENT:                                         *;
*   HAVE SQLITE3.EXE DIRECTORY IN THE SYSTEM PATH.     *;
* LIMITATIONS:                                         *;
*   1. UNABLE TO HANDLE THE TABLE WITH BLOB DATA TYPE  *;
*   2. THE DEFAULT LENGTH OF ALL CHAR DATA IS 200      *;
*   3. SYSTEM DEPENDENT (CHANGES NEEDED FOR OTHER OS)  *;
*   4. POTENTIALLY SLOW FOR LARGE TABLES               *;
********************************************************;

%let chlen = 200;

options mlogic symbolgen mprint;

%macro sqlite(path = , dbfile = , table = );

%let fileref = %trim(&path)\%trim(&dbfile);

filename fileref "&fileref";

%if %sysfunc(fexist(fileref)) %then %do;

  %let dumpref = %trim(&path)\%trim(dump.txt);

  filename dumpref "&dumpref";

  %if %sysfunc(fexist(dumpref)) %then %do;
    data _null_;
      rc = fdelete("dumpref");
    run;
  %end;

  %let batref = %trim(&path)\%trim(dump.bat);

  filename batref "&batref";
  

  %if %sysfunc(fexist(batref)) %then %do;
    data _null_;
      rc = fdelete("batref");
    run;
  %end;

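  * write a one-line batch file that calls sqlite3 to dump the table to a text file;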
  data _null_;
    file batref;
    cmd = "sqlite3 &fileref "||'"'||".dump &table"||'"'||" > &dumpref";
    put cmd;
  run;
  
  systask command "&batref" wait;  
      
%end;
%else %goto exit;

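* read the dump and keep the CREATE TABLE statement, skipping the first line and stopping at the closing parenthesis line;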
data create;
  infile dumpref;
  input;
  create = upcase(_infile_);
  if _n_ > 1 then output;
  if substr(create, 1, 2) = ');' then stop;
run;

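* convert SQLite TEXT columns to CHAR(&chlen.) and execute the CREATE TABLE statement via PROC SQL;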
proc sql noprint;
select
  tranwrd(create, "TEXT", "CHAR(&chlen.)")
into
  :create separated by ' '
from
  create;

&create;
quit;    

proc datasets library = work nolist;
  delete create(mt = data);
run;
quit;

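* replay the INSERT statements from the dump through PROC SQL using CALL EXECUTE;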
data _null_;
  infile dumpref end = eof;
  input;
  if _n_ = 1 then do;
    call execute("PROC SQL;");
  end;
  if index(upcase(_infile_), 'INSERT INTO') > 0 then do;
    pos1 = index(_infile_, '(');
    pos2 = index(_infile_, ');');    
    insert = substr(_infile_, pos1, pos2 - pos1 + 2);
    call execute("INSERT INTO &TABLE VALUES"||insert);
  end;
  if eof then do;
    call execute("QUIT;");
  end;
run;

data _null_;
  rc = fdelete("batref");
run;

data _null_;
  rc = fdelete("dumpref");
run;

proc contents data = &table varnum;
run;

%exit:
    
%mend sqlite;    

%sqlite(path = D:\sas, dbfile = test.db, table = tblco);
Jan 01, 2010
 
################################################
# EXPORT DATA FRAME INTO SQLITE DATABASE       #
# USING SQLDF PACKAGE(code.google.com/p/sqldf) #
################################################

library(sqldf)
library(datasets)
data(CO2)

### SET WORKING DIRECTORY ###
setwd('d:\\r')

### TEST IF THE FILE EXISTS ###
if(file.exists('test.db')) file.remove('test.db')

### CREATE A NEW EMPTY SQLITE DATABASE ###
sqldf("attach 'test.db' as new")

### CREATE A NEW TABLE FROM DATA FRAME ###
sqldf("create table tblco as select * from CO2", dbname = 'test.db')

### SHOW THE DATA IN SQLITE DATABASE ###
sqldf("select * from tblco limit 5", dbname = 'test.db')
#  Plant   Type  Treatment conc uptake
#1   Qn1 Quebec nonchilled   95   16.0
#2   Qn1 Quebec nonchilled  175   30.4
#3   Qn1 Quebec nonchilled  250   34.8
#4   Qn1 Quebec nonchilled  350   37.2
#5   Qn1 Quebec nonchilled  500   35.3
Dec 26, 2009
 
###############################
# CALCULATE SUMMARY BY GROUPS #
###############################

data(iris)

### WITH AGGREGATE() FUNCTION ###
sum1 <- aggregate(iris[-5], iris[5], mean)
sum1
#     Species Sepal.Length Sepal.Width Petal.Length Petal.Width
#1     setosa        5.006       3.428        1.462       0.246
#2 versicolor        5.936       2.770        4.260       1.326
#3  virginica        6.588       2.974        5.552       2.026

### WITH BY() FUNCTION ###
tmp2 <- by(iris[-5], iris[5], mean)
sum2 <- cbind(expand.grid(dimnames(tmp2)), do.call(rbind, tmp2))
rownames(sum2) <- NULL
sum2              
#     Species Sepal.Length Sepal.Width Petal.Length Petal.Width
#1     setosa        5.006       3.428        1.462       0.246
#2 versicolor        5.936       2.770        4.260       1.326
#3  virginica        6.588       2.974        5.552       2.026

### WITH RESHAPE PACKAGE ###
library(reshape)
tmp3 <- melt(iris)
sum3 <- cast(tmp3, Species ~ variable, mean)
sum3
#     Species Sepal.Length Sepal.Width Petal.Length Petal.Width
#1     setosa        5.006       3.428        1.462       0.246
#2 versicolor        5.936       2.770        4.260       1.326
#3  virginica        6.588       2.974        5.552       2.026

### WITH SQLDF PACKAGE ###
library(sqldf)
sum4 <- sqldf("select
                 Species,
                 avg(Sepal_Length) as Sepal_Length,
                 avg(Sepal_Width)  as Sepal_Width,
                 avg(Petal_Length) as Petal_Length,
                 avg(Petal_Width)  as Petal_Width
               from
                 iris group by Species");
sum4
#     Species Sepal_Length Sepal_Width Petal_Length Petal_Width
#1     setosa        5.006       3.428        1.462       0.246
#2 versicolor        5.936       2.770        4.260       1.326
#3  virginica        6.588       2.974        5.552       2.026

### WITH LAPPLY() FUNCTION ###
tmp5 <- lapply(split(iris[-5], iris[5]), mean)
sum5 <- data.frame(Species = names(tmp5), do.call(rbind, tmp5))
rownames(sum5) <- NULL
sum5 
#     Species Sepal.Length Sepal.Width Petal.Length Petal.Width
#1     setosa        5.006       3.428        1.462       0.246
#2 versicolor        5.936       2.770        4.260       1.326
#3  virginica        6.588       2.974        5.552       2.026

From A Logical Point of View

Tagged: from a logical point of view, misc, quine, robert mitchum
Dec 21, 2009
 


This blog, “From A Logical Point of View”, is not owned by a logician, as the name might suggest. The book From A Logical Point of View is a collection of logical and philosophical essays by W. V. Quine (1908-2000), an American philosopher and mathematician.

A story about the title: “From A Logical Point of View” was a calypso song performed by Robert Mitchum, a US actor, composer, and singer. Quine enjoyed the song and borrowed it as the title of his new book, which became his best seller. I love this book too, so I named my blog after it, from a logical point of view.

Dec 21, 2009
 
If you have a magazine subscription or you read the magazines at the checkout counter, you know that they are always a month ahead of your calendar. My January 2010 magazines have already arrived and they are all talking about resolutions. Resolutions are a good thing, but shouldn't be limited to January 1. So try this year-long resolution -- challenge your comfort zone once each week. That's only 52 new things for 2010.

We all get in ruts and don't always realize it. These ruts can keep us from having innovative ideas, from meeting interesting people, or from finding a new favorite food. Stepping outside of your comfort zone can make you happier, healthier, and maybe wealthier if your innovative ideas are good enough.

I have already started working on this concept. Here's my small, fun story. I went to a holiday lunch with a couple of friends. At the last minute, we picked the café inside the landscaping company next door instead of the restaurant we had driven downtown for. With a little encouragement, I bought a new swag to decorate my mantel. It is beautiful, and I can't wait for my family to take Christmas pictures in front of it.




My comfort zone challenge continues into the new year, personally and professionally. Over the next few weeks, I'll be finishing up a paper to be presented at SAS Global Forum 2010. This is a big step for many of us, but when you have information to share, the payoff for you and your audience can be well worth the sweaty palms and rapid heartbeat.

But, you don't have to go that far. I've provided a few simple personal and professional suggestions to help you challenge your comfort zone in 2010.

Start small. Try the restaurant next door to your favorite restaurant. Sit at a different table in your company café. Watch a video to learn a new skill instead of using your favorite documentation. Visit your local library and check out an author you have never heard of. Or, try one of these:

Dec 16, 2009
 
There is an ancient t-shirt that I have from my first users group conference. It dates from 1995, and it shows a cat holding a floppy disk and the words WUSS (for Western Users of SAS Software) on it. Never mind that I can't fit into it anymore; nor can I bear to turn it into a rag for washing my car. It's a great t-shirt.

Don’t we all have a favorite t-shirt or two? And wouldn’t it be really cool to design something that others love to wear and might outgrow? Well – now’s your chance. There’s a contest on sasCommunity.org that you need to check out. A few quick rules: Original designs only / the t-shirt can be any representation of your use of SAS, the SAS Community in general, sasCommunity.org, or SAS Global Forum. / It can be humorous or serious – just keep it clean and original. And as you might expect, you have to agree to the Legal Mumbo Jumbo to submit. We have 3 submissions already!

It’s actually pretty simple: The winner will be announced on sasCommunity.org on February 8, 2010 and a limited number of the winning t-shirts will be distributed at SAS Global Forum 2010. For the record, I’ll be wearing an XL, unless my New Year’s resolution to lose 15 pounds is successful.

Clustering Handwritten Digits (digitized optical images)

Tagged: HOSVD, K-means Clustering, PROC PRINCOMP, SVD, Tensor
Dec 13, 2009
 


In this example, we show a clustering exercise on the Optical Recognition of Handwritten Digits data set, available at the UCI data set repository (Link).

This exercise is a standard application of HOSVD: the 8x8 digitized bitmap of each digit is stacked into a tensor whose 1-mode indexes the bitmap columns, whose 2-mode indexes the bitmap rows, and whose 3-mode indexes the digit objects, a standard tensor representation. U_(3) from the HOSVD of this tensor carries the information used for clustering. In this exercise, we apply the k-means algorithm to the matrix of correlations between each row of U_(3), which corresponds to one digit object, and each column of the right eigenvector matrix, in order to reduce the effect of outliers and stabilize the matrix fed to the final clustering step.
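
As a rough illustration of this pipeline (not the SAS prototype used for the actual analysis), here is a sketch in R; the function name hosvd_cluster, the rank r, and the reading of the correlation step as correlations between the rows of U_(3) are assumptions made for this sketch.

### ILLUSTRATIVE R SKETCH OF THE HOSVD + K-MEANS PIPELINE ###
### X: n x 64 matrix whose rows are the flattened 8x8 digit bitmaps, so
### the 3-mode unfolding of the 8 x 8 x n tensor is X itself, up to a
### column permutation that does not change the left singular vectors
hosvd_cluster <- function(X, k = 10, r = 20) {
  U3 <- svd(X, nu = r, nv = 0)$u      # HOSVD factor of the object mode
  R  <- cor(t(U3))                    # correlations between digit objects
  kmeans(R, centers = k, nstart = 25)$cluster
}

The key observation is that, because each digit is one slice of the tensor, U_(3) is simply the left singular matrix of the object-by-pixel data matrix.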

The clustering result is shown below, together with the original unclustered correlation data and the correlation data sorted by true digit ID. To make the display easier to read, we show the confusion matrix of the correlation matrix, so the clustering result is easier to identify.

Cross Tabulation of True Class (0-9) and Clustering Result (1-10):



Further thoughts: the confusion matrix display shows that the digit clusters do show up, but the pattern is not as prominent as we expected. However, it is possible to obtain a better result by applying Nonnegative Matrix Factorization (NMF) to the square of the correlation confusion matrix. The NMF algorithm will be implemented in SAS soon.
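
As a rough illustration of that idea (and not the forthcoming SAS implementation), here is a sketch of plain Lee-Seung multiplicative updates in R; the function nmf_mu and its defaults are made up for this sketch, and V is assumed to be any nonnegative matrix, such as the squared correlation confusion matrix.

### ILLUSTRATIVE NMF SKETCH: PLAIN LEE-SEUNG MULTIPLICATIVE UPDATES ###
### V must be nonnegative, e.g. the squared correlation confusion matrix
nmf_mu <- function(V, k, iters = 500, eps = 1e-9) {
  W <- matrix(runif(nrow(V) * k), nrow(V), k)
  H <- matrix(runif(k * ncol(V)), k, ncol(V))
  for (i in seq_len(iters)) {
    H <- H * (t(W) %*% V) / (t(W) %*% W %*% H + eps)   # update H
    W <- W * (V %*% t(H)) / (W %*% H %*% t(H) + eps)   # update W
  }
  list(W = W, H = H)
}
### e.g. fit <- nmf_mu(R^2, k = 10); cluster <- max.col(fit$W)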

Note: all numerical analyses were done with the prototype code demonstrated in this blog entry (Link). The figure was drawn in R.



【 Clustering Result shown in Confusion Matrix style 】

Reference:
Matrix Methods in Data Mining and Pattern Recognition by Lars Eldén


Tensor in SAS

Tagged: HOSVD, predictive modeling, SVD, Tensor
Dec 10, 2009
 


A tensor, the mathematical concept for a higher-order array, is a very useful tool in modern data mining applications. HOSVD, the counterpart of the SVD for higher-order arrays, is at the heart of modern applications such as face recognition, and the clustering and segmentation of multi-dimensional transaction data.


【"Matrix Methods in Data Mining and Pattern Recognition" by Lars Eldén】

But implementing the math operations defined for tensors is not easy, let alone doing it in SAS. Because of the design of the SAS data set, everything has to be done in a 1-D or 2-D matrix fashion. Luckily, tensor operations have low-dimensional matrix counterparts; the matrix subscripts just need to be calculated carefully.

I spent the last couple of days implementing some key tensor operations, notably tensor unfolding (tensor to matrix) and HOSVD, for my research and work. Prototype SAS code for 3-mode tensor unfolding and HOSVD is shown below. The data are from De Lathauwer et al., "A Multilinear Singular Value Decomposition", SIAM J. Matrix Anal. Appl., Vol. 21, No. 4.
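
Before the SAS code, here is a minimal R sketch of the same two operations, intended only as a cross-check; the helpers unfold and hosvd3 are made up for this sketch, and the column ordering of the unfolding (which differs between conventions) only permutes columns, so it does not affect the factor matrices.

### CROSS-CHECK IN R: 3-MODE TENSOR UNFOLDING AND HOSVD FACTORS ###
unfold <- function(A, mode) {
  ### n-mode matricization: rows indexed by `mode`, columns by the rest
  t(apply(A, mode, c))
}

hosvd3 <- function(A) {
  ### factor matrices U1, U2, U3 = left singular vectors of each unfolding
  lapply(1:3, function(m) svd(unfold(A, m))$u)
}
### e.g. U <- hosvd3(A) for a 3 x 3 x 3 array A holding the example data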



data A1;
     input X1-X3;
datalines;
 0.9073  0.8924  2.1488
 0.7158 -0.4898  0.3054
-0.3698  2.4288  2.3753
 1.7842  1.7753  4.2495
 1.6970 -1.5077  0.3207
 0.0151  4.0337  4.7146
 2.1236 -0.6631  1.8260
-0.0740  1.9103  2.1335
 1.4420 -1.7495 -0.2716
;
run;

* catalog entry holding the mode and dimensions of A1;
filename Acat  catalog 'work.tensor.A1.CATAMS';

data _null_;
     file Acat;
     mode='1;';
     dimension='3,3,3;';
     put mode dimension;
run;

%let _dims=2 3 4;
%let _totalmode=3;
%let _mode0=1;
%let _modex=2;
%let _dimension2=6;
%let _dimension1=4;

data _null_;
      infile Acat;
      input;  
      _index=trim(scan(_infile_, 1, ';')); put _index=;
      _dimension=trim(scan(_infile_, 2, ';')); put _dimension=;       
      d2=count((_dimension), ',');  put d2=;
      call symput('_totalmode', d2+1);   
      call symput('_dims', translate(_dimension, ' ', ','));
      _dim0=scan(_dimension, &_mode0, ',');
      _dimx=scan(_dimension, &_modex, ',');
      call symput('_dim0', _dim0);
      call symput('_dimx', _dimx);
      call symput('_dim0x', compress(max(_dim0, _dimx)));
      dim2=1; dim1=1;
      do j=1 to d2+1;      
         if j=&_modex then dim1=dim1*scan(_dimension, j, ',');
         else dim2=dim2*scan(_dimension, j, ',');         
      end;
      put dim1= dim2=;
      call symput('_dimension1', compress(dim1));
      call symput('_dimension2', compress(dim2));   
run;
%put &_totalmode  &_dim0x;
%put &_dimension1  &_dimension2;
%put &_dims;

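* build the &_modex.-mode unfolding A&_modex from the &_mode0.-mode unfolding A&_mode0;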
data  A&_modex;
      array _dim{&_totalmode} _temporary_ (&_dims);
      array _v{&_totalmode} _temporary_;
      array _datavec{&_dim0x} X1-X&_dim0x;
      array _X{&_dimension2, &_dimension1} _temporary_;
      set A&_mode0  end=eof;
      m1=mod(&_mode0+1, &_totalmode); if m1=0 then m1=&_totalmode;
      m2=mod(&_mode0+2, &_totalmode); if m2=0 then m2=&_totalmode;
      _v[m2]=mod(_n_, _dim[m2]); if _v[m2]=0 then _v[m2]=_dim[m2];
      _v[m1]=floor((_n_-_v[m2])/_dim[m2])+1;
      do j=1 to &_dim0;
         _v[&_mode0]=j;
         m1=mod(&_modex+1, &_totalmode); if m1=0 then m1=&_totalmode;
         m2=mod(&_modex+2, &_totalmode); if m2=0 then m2=&_totalmode;  
         _m=_v[&_modex];
         _n=(_v[m1]-1)*_dim[m2]+_v[m2];
         _X[_n, _m]=_datavec[j];
      end;
      if eof then do;
         do j=1 to &_dimension2;
            do k=1 to &_dimension1;
               _datavec[k]=_X[j, k];
            end;
            keep X1-X&_dim0x;
            output;
         end;
   end; 
run;

filename Acat  catalog "work.tensor.A&_modex..CATAMS";
data _null_;
     file Acat;
     mode="&_modex;";
     dimension="3,3,3;";
     put mode dimension;
run;

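/* A3, the 3-mode unfolding, is assumed to have been produced by
   re-running the unfolding step above with %let _modex=3; */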
ods select none;
proc princomp data=A1  outstat=stat_1 noint  cov;
     var X1-X3;
run;
proc princomp data=A2  outstat=stat_2 noint  cov;
     var X1-X3;
run;
proc princomp data=A3  outstat=stat_3 noint  cov;
     var X1-X3;
run;
ods select all;

/***************************************
In order to obtain the Tensor core S, we can
do something like this:
S_(1) = t(U1) %*% A_(1) %*% kronecker(U2, U3)
where S_(1) is the 1-mode matrix of S and A_(1)
is the 1-mode matrix of A
****************************************/

%macro kroneck(mat_A, mat_B, mat_kronecker);
%local dsid row_B col_B  col_A;

%let dsid=%sysfunc(open(&mat_B));
%let row_B=%sysfunc(attrn(&dsid, nobs));
%let col_B=%sysfunc(attrn(&dsid, nvars));
%let dsid=%sysfunc(close(&dsid));
%let dsid=%sysfunc(open(&mat_A));
%let col_A=%sysfunc(attrn(&dsid, nvars));
%let dsid=%sysfunc(close(&dsid));

data _null_;
     col_K=&col_A*&col_B;
     call symput('col_K', compress(col_K));
run;
%put &col_A  &col_B  &col_K &row_B;

proc sql noprint;
     select name into :vars_A separated by ' '
  from   sashelp.vcolumn
  where  libname='WORK' and memname=%upcase("&mat_A")
  ;
     select name into :vars_B separated by ' '
  from   sashelp.vcolumn
  where  libname='WORK' and memname=%upcase("&mat_B")
  ;
quit;

data &mat_kronecker;
     array _K{&col_K} K1-K&col_K;
     array _A{&col_A} &vars_A;
     array _B{&col_B} &vars_B;
     array _Atmp{&col_A} _temporary_;
     array _Btmp{&col_B} _temporary_;

     do n1=1 to na;
        set &mat_A  nobs=na point=n1;
        do j=1 to dim(_A);
           _Atmp[j]=_A[j];
        end;
        do n=1 to nb;
           set &mat_B nobs=nb point=n;
           do j=1 to dim(_B);
              _Btmp[j]=_B[j];
           end;
           do j=1 to &col_K;
              _ib=mod(j, &col_B); if _ib=0 then _ib=&col_B;
              _ia=floor((j-_ib)/&col_B)+1;
              *put _ia= _ib=  j= _A[_ia]= _B[_ib]=;
              _K[j]=_Atmp[_ia]*_Btmp[_ib];
           end;
           keep K1-K&col_K;
           output;
        end;
     end;
     stop;
run;

%mend kroneck;

options mprint  mlogic;
/* example call, assuming data sets B and A exist in the session:
%kroneck(B, A, C);
*/

data U1t(keep=X:);
     set stat_1;
     where _type_='USCORE';
run;
data U2t(keep=X:);
     set stat_2;
     where _type_='USCORE';
run;
data U3t(keep=X:);
     set stat_3;
     where _type_='USCORE';
run;

options nomprint  nomlogic;
%kroneck(U2t, U3t, U23t);

data A1score;
     set A1;
     retain _TYPE_ 'PARMS'; 
     _NAME_=compress('K'||_n_);
run;

proc score data=u1t  score=A1score out=u1A1(keep=K:) type=parms;
     var X1-X3;
run;
data u1A1;
     set u1A1;
     retain _TYPE_ 'PARMS'; 
     _NAME_=compress('X'||_n_);
run;

proc score data=u23t  score=u1A1  out=S1(keep=X:) type=parms;
     var K1-K9;
run;
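
As a final cross-check, the core slice S_(1) produced by the PROC SCORE steps above can be written as a one-liner in R; this is only a transcription of the formula in the comment block earlier in the code, assuming A1 is the 3 x 9 one-mode unfolding in De Lathauwer's column ordering and U1, U2, U3 are the HOSVD factor matrices.

### ILLUSTRATIVE R TRANSCRIPTION OF THE CORE FORMULA ABOVE ###
### A1: the 3 x 9 one-mode unfolding of A in De Lathauwer's column
### ordering; U1, U2, U3: the HOSVD factor matrices. A different
### column ordering of A_(1) would swap the Kronecker factors.
S1 <- t(U1) %*% A1 %*% kronecker(U2, U3)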


Reference:
Matrix Methods in Data Mining and Pattern Recognition by Lars Eldén
