python

8月 12 2011
 
昨天研究了下Redis,做了几个实验。Sina weibo是用的redis,处理key-value数据
afan@ubuntu:~$ tar zxvf redis-2.2.12.tar.gz 
redis-2.2.12/
redis-2.2.12/.gitignore
redis-2.2.12/00-RELEASENOTES
redis-2.2.12/BUGS
redis-2.2.12/CONTRIBUTING
redis-2.2.12/COPYING
redis-2.2.12/Changelog
redis-2.2.12/INSTALL
redis-2.2.12/Makefile
redis-2.2.12/README
redis-2.2.12/TODO
redis-2.2.12/client-libraries/
redis-2.2.12/client-libraries/README
redis-2.2.12/deps/
redis-2.2.12/deps/hiredis/
redis-2.2.12/deps/hiredis/.gitignore
redis-2.2.12/deps/hiredis/COPYING
redis-2.2.12/deps/hiredis/Makefile
redis-2.2.12/deps/hiredis/README.md
redis-2.2.12/deps/hiredis/TODO
redis-2.2.12/deps/hiredis/adapters/
redis-2.2.12/deps/hiredis/adapters/ae.h
redis-2.2.12/deps/hiredis/adapters/libev.h
redis-2.2.12/deps/hiredis/adapters/libevent.h
redis-2.2.12/deps/hiredis/async.c
redis-2.2.12/deps/hiredis/async.h
redis-2.2.12/deps/hiredis/example-ae.c
redis-2.2.12/deps/hiredis/example-libev.c
redis-2.2.12/deps/hiredis/example-libevent.c
redis-2.2.12/deps/hiredis/example.c
redis-2.2.12/deps/hiredis/fmacros.h
redis-2.2.12/deps/hiredis/hiredis.c
redis-2.2.12/deps/hiredis/hiredis.h
redis-2.2.12/deps/hiredis/net.c
redis-2.2.12/deps/hiredis/net.h
redis-2.2.12/deps/hiredis/sds.c
redis-2.2.12/deps/hiredis/sds.h
redis-2.2.12/deps/hiredis/test.c
redis-2.2.12/deps/hiredis/util.h
redis-2.2.12/deps/linenoise/
redis-2.2.12/deps/linenoise/.gitignore
redis-2.2.12/deps/linenoise/Makefile
redis-2.2.12/deps/linenoise/README.markdown
redis-2.2.12/deps/linenoise/example.c
redis-2.2.12/deps/linenoise/linenoise.c
redis-2.2.12/deps/linenoise/linenoise.h
redis-2.2.12/design-documents/
redis-2.2.12/design-documents/REDIS-CLUSTER
redis-2.2.12/design-documents/REDIS-CLUSTER-2
redis-2.2.12/doc/
redis-2.2.12/doc/AppendCommand.html
redis-2.2.12/doc/AppendOnlyFileHowto.html
redis-2.2.12/doc/AuthCommand.html
redis-2.2.12/doc/Benchmarks.html
redis-2.2.12/doc/BgrewriteaofCommand.html
redis-2.2.12/doc/BgsaveCommand.html
redis-2.2.12/doc/BlpopCommand.html
redis-2.2.12/doc/BrpoplpushCommand.html
redis-2.2.12/doc/CommandReference.html
redis-2.2.12/doc/Comparisons.html
redis-2.2.12/doc/ConfigCommand.html
redis-2.2.12/doc/Configuration.html
redis-2.2.12/doc/ConnectionHandlingSidebar.html
redis-2.2.12/doc/ControlCommandsSidebar.html
redis-2.2.12/doc/Credits.html
redis-2.2.12/doc/DbsizeCommand.html
redis-2.2.12/doc/DelCommand.html
redis-2.2.12/doc/DesignPatterns.html
redis-2.2.12/doc/EventLibray.html
redis-2.2.12/doc/ExistsCommand.html
redis-2.2.12/doc/ExpireCommand.html
redis-2.2.12/doc/FAQ.html
redis-2.2.12/doc/Features.html
redis-2.2.12/doc/FlushallCommand.html
redis-2.2.12/doc/FlushdbCommand.html
redis-2.2.12/doc/FromSqlToDataStructures.html
redis-2.2.12/doc/GenericCommandsSidebar.html
redis-2.2.12/doc/GetCommand.html
redis-2.2.12/doc/GetbitCommand.html
redis-2.2.12/doc/GetsetCommand.html
redis-2.2.12/doc/HackingStrings.html
redis-2.2.12/doc/HashCommandsSidebar.html
redis-2.2.12/doc/Hashes.html
redis-2.2.12/doc/HdelCommand.html
redis-2.2.12/doc/HexistsCommand.html
redis-2.2.12/doc/HgetCommand.html
redis-2.2.12/doc/HgetallCommand.html
redis-2.2.12/doc/HincrbyCommand.html
redis-2.2.12/doc/HlenCommand.html
redis-2.2.12/doc/HmgetCommand.html
redis-2.2.12/doc/HmsetCommand.html
redis-2.2.12/doc/HsetCommand.html
redis-2.2.12/doc/HsetnxCommand.html
redis-2.2.12/doc/IncrCommand.html
redis-2.2.12/doc/InfoCommand.html
redis-2.2.12/doc/IntroductionToRedisDataTypes.html
redis-2.2.12/doc/KeysCommand.html
redis-2.2.12/doc/LastsaveCommand.html
redis-2.2.12/doc/LindexCommand.html
redis-2.2.12/doc/ListCommandsSidebar.html
redis-2.2.12/doc/Lists.html
redis-2.2.12/doc/LlenCommand.html
redis-2.2.12/doc/LpopCommand.html
redis-2.2.12/doc/LrangeCommand.html
redis-2.2.12/doc/LremCommand.html
redis-2.2.12/doc/LsetCommand.html
redis-2.2.12/doc/LtrimCommand.html
redis-2.2.12/doc/MgetCommand.html
redis-2.2.12/doc/MonitorCommand.html
redis-2.2.12/doc/MoveCommand.html
redis-2.2.12/doc/MsetCommand.html
redis-2.2.12/doc/MultiExecCommand.html
redis-2.2.12/doc/NonexistentCommands.html
redis-2.2.12/doc/ObjectHashMappers.html
redis-2.2.12/doc/Pipelining.html
redis-2.2.12/doc/ProgrammingExamples.html
redis-2.2.12/doc/ProtocolSpecification.html
redis-2.2.12/doc/PublishSubscribe.html
redis-2.2.12/doc/QuickStart.html
redis-2.2.12/doc/QuitCommand.html
redis-2.2.12/doc/README.html
redis-2.2.12/doc/RandomkeyCommand.html
redis-2.2.12/doc/Redis0100ChangeLog.html
redis-2.2.12/doc/Redis0900ChangeLog.html
redis-2.2.12/doc/RedisBigData.html
redis-2.2.12/doc/RedisCLI.html
redis-2.2.12/doc/RedisEventLibrary.html
redis-2.2.12/doc/RedisGuides.html
redis-2.2.12/doc/RedisInternals.html
redis-2.2.12/doc/RedisPipelining.html
redis-2.2.12/doc/RedisStatus.html
redis-2.2.12/doc/Redis_1_2_0_Changelog.html
redis-2.2.12/doc/Redis_2_0_0_Changelog.html
redis-2.2.12/doc/Redis_2_0_Whats_new.html
redis-2.2.12/doc/RenameCommand.html
redis-2.2.12/doc/RenamenxCommand.html
redis-2.2.12/doc/ReplicationHowto.html
redis-2.2.12/doc/ReplyTypes.html
redis-2.2.12/doc/RoadMap.html
redis-2.2.12/doc/RpoplpushCommand.html
redis-2.2.12/doc/RpushCommand.html
redis-2.2.12/doc/SaddCommand.html
redis-2.2.12/doc/SaveCommand.html
redis-2.2.12/doc/ScardCommand.html
redis-2.2.12/doc/SdiffCommand.html
redis-2.2.12/doc/SdiffstoreCommand.html
redis-2.2.12/doc/SelectCommand.html
redis-2.2.12/doc/SetCommand.html
redis-2.2.12/doc/SetCommandsSidebar.html
redis-2.2.12/doc/SetbitCommand.html
redis-2.2.12/doc/SetexCommand.html
redis-2.2.12/doc/SetnxCommand.html
redis-2.2.12/doc/SetrangeCommand.html
redis-2.2.12/doc/Sets.html
redis-2.2.12/doc/ShutdownCommand.html
redis-2.2.12/doc/SideBar.html
redis-2.2.12/doc/SinterCommand.html
redis-2.2.12/doc/SinterstoreCommand.html
redis-2.2.12/doc/SismemberCommand.html
redis-2.2.12/doc/SlaveofCommand.html
redis-2.2.12/doc/SmembersCommand.html
redis-2.2.12/doc/SmoveCommand.html
redis-2.2.12/doc/SortCommand.html
redis-2.2.12/doc/SortedSetCommandsSidebar.html
redis-2.2.12/doc/SortedSets.html
redis-2.2.12/doc/Speed.html
redis-2.2.12/doc/SponsorshipHistory.html
redis-2.2.12/doc/SpopCommand.html
redis-2.2.12/doc/SrandmemberCommand.html
redis-2.2.12/doc/SremCommand.html
redis-2.2.12/doc/StringCommandsSidebar.html
redis-2.2.12/doc/Strings.html
redis-2.2.12/doc/StrlenCommand.html
redis-2.2.12/doc/SubstrCommand.html
redis-2.2.12/doc/SunionCommand.html
redis-2.2.12/doc/SunionstoreCommand.html
redis-2.2.12/doc/SupportedLanguages.html
redis-2.2.12/doc/SupportedPlatforms.html
redis-2.2.12/doc/TemplateCommand.html
redis-2.2.12/doc/TtlCommand.html
redis-2.2.12/doc/TwitterAlikeExample.html
redis-2.2.12/doc/TypeCommand.html
redis-2.2.12/doc/UnstableSource.html
redis-2.2.12/doc/VirtualMemorySpecification.html
redis-2.2.12/doc/VirtualMemoryUserGuide.html
redis-2.2.12/doc/ZaddCommand.html
redis-2.2.12/doc/ZcardCommand.html
redis-2.2.12/doc/ZincrbyCommand.html
redis-2.2.12/doc/ZrangeCommand.html
redis-2.2.12/doc/ZrangebyscoreCommand.html
redis-2.2.12/doc/ZrankCommand.html
redis-2.2.12/doc/ZremCommand.html
redis-2.2.12/doc/ZremrangebyrankCommand.html
redis-2.2.12/doc/ZremrangebyscoreCommand.html
redis-2.2.12/doc/ZscoreCommand.html
redis-2.2.12/doc/ZunionCommand.html
redis-2.2.12/doc/ZunionstoreCommand.html
redis-2.2.12/doc/index.html
redis-2.2.12/doc/redis.png
redis-2.2.12/doc/style.css
redis-2.2.12/redis.conf
redis-2.2.12/src/
redis-2.2.12/src/Makefile
redis-2.2.12/src/adlist.c
redis-2.2.12/src/adlist.h
redis-2.2.12/src/ae.c
redis-2.2.12/src/ae.h
redis-2.2.12/src/ae_epoll.c
redis-2.2.12/src/ae_kqueue.c
redis-2.2.12/src/ae_select.c
redis-2.2.12/src/anet.c
redis-2.2.12/src/anet.h
redis-2.2.12/src/aof.c
redis-2.2.12/src/config.c
redis-2.2.12/src/config.h
redis-2.2.12/src/db.c
redis-2.2.12/src/debug.c
redis-2.2.12/src/dict.c
redis-2.2.12/src/dict.h
redis-2.2.12/src/fmacros.h
redis-2.2.12/src/help.h
redis-2.2.12/src/intset.c
redis-2.2.12/src/intset.h
redis-2.2.12/src/lzf.h
redis-2.2.12/src/lzfP.h
redis-2.2.12/src/lzf_c.c
redis-2.2.12/src/lzf_d.c
redis-2.2.12/src/mkreleasehdr.sh
redis-2.2.12/src/multi.c
redis-2.2.12/src/networking.c
redis-2.2.12/src/object.c
redis-2.2.12/src/pqsort.c
redis-2.2.12/src/pqsort.h
redis-2.2.12/src/pubsub.c
redis-2.2.12/src/rdb.c
redis-2.2.12/src/redis-benchmark.c
redis-2.2.12/src/redis-check-aof.c
redis-2.2.12/src/redis-check-dump.c
redis-2.2.12/src/redis-cli.c
redis-2.2.12/src/redis.c
redis-2.2.12/src/redis.h
redis-2.2.12/src/release.c
redis-2.2.12/src/replication.c
redis-2.2.12/src/sds.c
redis-2.2.12/src/sds.h
redis-2.2.12/src/sha1.c
redis-2.2.12/src/sha1.h
redis-2.2.12/src/slowlog.c
redis-2.2.12/src/slowlog.h
redis-2.2.12/src/solarisfixes.h
redis-2.2.12/src/sort.c
redis-2.2.12/src/syncio.c
redis-2.2.12/src/t_hash.c
redis-2.2.12/src/t_list.c
redis-2.2.12/src/t_set.c
redis-2.2.12/src/t_string.c
redis-2.2.12/src/t_zset.c
redis-2.2.12/src/testhelp.h
redis-2.2.12/src/util.c
redis-2.2.12/src/valgrind.sup
redis-2.2.12/src/version.h
redis-2.2.12/src/vm.c
redis-2.2.12/src/ziplist.c
redis-2.2.12/src/ziplist.h
redis-2.2.12/src/zipmap.c
redis-2.2.12/src/zipmap.h
redis-2.2.12/src/zmalloc.c
redis-2.2.12/src/zmalloc.h
redis-2.2.12/tests/
redis-2.2.12/tests/assets/
redis-2.2.12/tests/assets/default.conf
redis-2.2.12/tests/integration/
redis-2.2.12/tests/integration/aof.tcl
redis-2.2.12/tests/integration/redis-cli.tcl
redis-2.2.12/tests/integration/replication.tcl
redis-2.2.12/tests/support/
redis-2.2.12/tests/support/redis.tcl
redis-2.2.12/tests/support/server.tcl
redis-2.2.12/tests/support/test.tcl
redis-2.2.12/tests/support/tmpfile.tcl
redis-2.2.12/tests/support/util.tcl
redis-2.2.12/tests/test_helper.tcl
redis-2.2.12/tests/tmp/
redis-2.2.12/tests/tmp/.gitignore
redis-2.2.12/tests/unit/
redis-2.2.12/tests/unit/auth.tcl
redis-2.2.12/tests/unit/basic.tcl
redis-2.2.12/tests/unit/cas.tcl
redis-2.2.12/tests/unit/expire.tcl
redis-2.2.12/tests/unit/other.tcl
redis-2.2.12/tests/unit/printver.tcl
redis-2.2.12/tests/unit/protocol.tcl
redis-2.2.12/tests/unit/pubsub.tcl
redis-2.2.12/tests/unit/quit.tcl
redis-2.2.12/tests/unit/slowlog.tcl
redis-2.2.12/tests/unit/sort.tcl
redis-2.2.12/tests/unit/type/
redis-2.2.12/tests/unit/type/hash.tcl
redis-2.2.12/tests/unit/type/list.tcl
redis-2.2.12/tests/unit/type/set.tcl
redis-2.2.12/tests/unit/type/zset.tcl
redis-2.2.12/utils/
redis-2.2.12/utils/build-static-symbols.tcl
redis-2.2.12/utils/generate-command-help.rb
redis-2.2.12/utils/mktarball.sh
redis-2.2.12/utils/redis-copy.rb
redis-2.2.12/utils/redis-sha1.rb
redis-2.2.12/utils/redis_init_script
afan@ubuntu:~$ cd redis-2.2.12/
afan@ubuntu:~/redis-2.2.12$ ls
00-RELEASENOTES   CONTRIBUTING      doc       redis.conf  utils
BUGS              COPYING           INSTALL   src
Changelog         deps              Makefile  tests
client-libraries  design-documents  README    TODO

安装redis
afan@ubuntu:~/redis-2.2.12$ make
cd src && make all
make[1]: 正在进入目录 `/home/afan/redis-2.2.12/src'
MAKE hiredis
cd ../deps/hiredis && make static ARCH=""
make[2]: 正在进入目录 `/home/afan/redis-2.2.12/deps/hiredis'
cc -c -std=c99 -pedantic -O3 -fPIC -Wall -W -Wwrite-strings    -g -ggdb  net.c
cc -c -std=c99 -pedantic -O3 -fPIC -Wall -W -Wwrite-strings    -g -ggdb  hiredis.c
cc -c -std=c99 -pedantic -O3 -fPIC -Wall -W -Wwrite-strings    -g -ggdb  sds.c
cc -c -std=c99 -pedantic -O3 -fPIC -Wall -W -Wwrite-strings    -g -ggdb  async.c
ar rcs libhiredis.a net.o hiredis.o sds.o async.o
make[2]:正在离开目录 `/home/afan/redis-2.2.12/deps/hiredis'
MAKE linenoise
cd ../deps/linenoise && make ARCH=""
make[2]: 正在进入目录 `/home/afan/redis-2.2.12/deps/linenoise'
cc  -c -Wall -W -Os -g linenoise.c
cc  -c -Wall -W -Os -g example.c
cc  -Wall -W -Os -g -o linenoise_example linenoise.o example.o
make[2]:正在离开目录 `/home/afan/redis-2.2.12/deps/linenoise'
    CC ae.o
    CC anet.o
    CC redis-benchmark.o
    CC sds.o
    CC adlist.o
    CC zmalloc.o
MAKE hiredis
cd ../deps/hiredis && make static ARCH=""
make[2]: 正在进入目录 `/home/afan/redis-2.2.12/deps/hiredis'
make[2]: 没有什么可以做的为 `static'。
make[2]:正在离开目录 `/home/afan/redis-2.2.12/deps/hiredis'
    LINK redis-benchmark
    CC redis-cli.o
    CC release.o
    LINK redis-cli
    CC redis-check-dump.o
    CC lzf_c.o
    CC lzf_d.o
    LINK redis-check-dump
    CC redis-check-aof.o
    LINK redis-check-aof
    CC dict.o
    CC redis.o
    CC pqsort.o
    CC zipmap.o
    CC sha1.o
    CC ziplist.o
    CC networking.o
    CC util.o
    CC object.o
    CC db.o
    CC replication.o
    CC rdb.o
    CC t_string.o
    CC t_list.o
    CC t_set.o
    CC t_zset.o
    CC t_hash.o
    CC config.o
    CC aof.o
    CC vm.o
    CC pubsub.o
    CC multi.o
    CC debug.o
    CC sort.o
    CC intset.o
    CC syncio.o
    CC slowlog.o
    LINK redis-server

Hint: To run 'make test' is a good idea ;)

make[1]:正在离开目录 `/home/afan/redis-2.2.12/src'
afan@ubuntu:~/redis-2.2.12$ 

启动server
afan@ubuntu:~/redis-2.2.12/src$ ./redis-server
[2485] 12 Aug 09:41:22 # Warning: no config file specified, using the default config. In order to specify a config file use 'redis-server /path/to/redis.conf'
[2485] 12 Aug 09:41:22 * Server started, Redis version 2.2.12
[2485] 12 Aug 09:41:22 # WARNING overcommit_memory is set to 0! Background save may fail under low memory condition. To fix this issue add 'vm.overcommit_memory = 1' to /etc/sysctl.conf and then reboot or run the command 'sysctl vm.overcommit_memory=1' for this to take effect.
[2485] 12 Aug 09:41:22 * The server is now ready to accept connections on port 6379
[2485] 12 Aug 09:41:22 - 0 clients connected (0 slaves), 539544 bytes in use
[2485] 12 Aug 09:41:27 - 0 clients connected (0 slaves), 539544 bytes in use

测试客户端
afan@ubuntu:~/redis-2.2.12/src$ ./redis-cli
redis 127.0.0.1:6379> set name afan
OK
redis 127.0.0.1:6379> get name
"afan"




通过python客户端调用
redis 127.0.0.1:6379> 
afan@ubuntu:~$ python
Python 2.7.1+ (r271:86832, Apr 11 2011, 18:05:24) 
[GCC 4.5.2] on linux2
Type "help", "copyright", "credits" or "license" for more information.
>>> import redis
>>> r_server = redis.Redis("localhost")
>>> r_server.set("name", "afan")
True
>>> r_server.get("name")
'afan'
>>> 


模拟google+ circle功能
建立两个圈子family和soccer
redis 127.0.0.1:6379> sadd circle:jdoe:family users:anna
(integer) 1
redis 127.0.0.1:6379> sadd circle:jdoe:family users:richard
(integer) 1
redis 127.0.0.1:6379> sadd circle:jdoe:family users:mike
(integer) 1
redis 127.0.0.1:6379> sadd circle:jdoe:soccer users:mike
(integer) 1
redis 127.0.0.1:6379> sadd circle:jdoe:soccer users:adam
(integer) 1
redis 127.0.0.1:6379> sadd circle:jdoe:soccer users:toby
(integer) 1
redis 127.0.0.1:6379> sadd circle:jdoe:soccer users:apollo
(integer) 1
计算成员
redis 127.0.0.1:6379> smembers circle:jdoe:family
1) "users:richard"
2) "users:mike"
3) "users:anna"

redis 127.0.0.1:6379> sinter circle:jdoe:family circle:jdoe:soccer
1) "users:mike"
redis 127.0.0.1:6379> sunion circle:jdoe:family circle:jdoe:soccer
1) "users:anna"
2) "users:mike"
3) "users:apollo"
4) "users:adam"
5) "users:richard"
6) "users:toby"

这样,可以很方便的模拟google+的circle功能
参考:


 Posted by at 1:29 下午
8月 10 2011
 
Ubuntu
测试代码:
#!/usr/bin/env python
import sys
import datetime
# area of space to investigate
x1, x2, y1, y2 = -2.13, 0.77, -1.3, 1.3

# Original code, prints progress (because it is slow)
# Uses complex datatype

def calculate_z_serial_purepython(q, maxiter, z):
    """Count Mandelbrot escape iterations for every point in q.

    For each index i the recurrence z[i] = z[i]*z[i] + q[i] is applied up to
    maxiter times.  The 0-based iteration at which abs(z[i]) first exceeds
    2.0 is stored in the result; points that never exceed 2.0 within maxiter
    iterations keep the initial value 0.

    q       -- sequence of complex points to test
    maxiter -- maximum number of iterations per point
    z       -- mutable sequence of starting values, at least as long as q;
               updated in place with the last value computed for each point
    Returns a list of ints with one entry per element of q.
    """
    total = len(q)
    output = [0] * total
    for i, point in enumerate(q):
        if i % 1000 == 0:
            # print out some progress info since it is so slow...
            # (a single-argument print() call works on Python 2 and 3 alike)
            print("%0.2f%% complete" % (1.0/total * i * 100))
        # work on a local value instead of indexing z and q on every step
        zi = z[i]
        for iteration in range(maxiter):
            zi = zi*zi + point
            if abs(zi) > 2.0:
                output[i] = iteration
                break
        # write the final value back so the caller's z ends up as before
        z[i] = zi
    return output

def calc_pure_python(show_output):
    # make a list of x and y values which will represent q
    # xx and yy are the co-ordinates, for the default configuration they'll look like:
    # if we have a 1000x1000 plot
    # xx = [-2.13, -2.1242, -2.1184000000000003, ..., 0.7526000000000064, 0.7584000000000064, 0.7642000000000064]
    # yy = [1.3, 1.2948, 1.2895999999999999, ..., -1.2844000000000058, -1.2896000000000059, -1.294800000000006]
    x_step = (float(x2 - x1) / float(w)) * 2
    y_step = (float(y1 - y2) / float(h)) * 2
    x=[]
    y=[]
    ycoord = y2
    while ycoord > y1:
        y.append(ycoord)
        ycoord += y_step
    xcoord = x1
    while xcoord < x2:
        x.append(xcoord)
        xcoord += x_step
    q = []
    for ycoord in y:
        for xcoord in x:
            q.append(complex(xcoord,ycoord))

    z = [0+0j] * len(q)
    print "Total elements:", len(z)
    start_time = datetime.datetime.now()
    output = calculate_z_serial_purepython(q, maxiter, z)
    end_time = datetime.datetime.now()
    secs = end_time - start_time
    print "Main took", secs

    validation_sum = sum(output)
    print "Total sum of elements (for validation):", validation_sum

    if show_output:
        try:
            import Image
            import numpy as nm
            output = nm.array(output)
            output = (output + (256*output) + (256**2)*output) * 8
            im = Image.new("RGB", (w/2, h/2))
            im.fromstring(output.tostring(), "raw", "RGBX", 0, -1)
            im.show()
        except ImportError as err:
            # Bail gracefully if we're using PyPy
            print "Couldn't import Image or numpy:", str(err)

if __name__ == "__main__":
    # get width, height and max iterations from cmd line
    # 'python mandelbrot_pypy.py 100 300'
    if len(sys.argv) < 3:
        # give a usage hint instead of dying with an IndexError traceback
        print("usage: python %s size maxiter" % sys.argv[0])
        sys.exit(1)
    # the plot is square: the first argument sets both width and height
    w = int(sys.argv[1]) # e.g. 100
    h = int(sys.argv[1]) # e.g. 100
    maxiter = int(sys.argv[2]) # e.g. 300

    # we can show_output for Python, not for PyPy
    calc_pure_python(True)

cProfile参考

afan@ubuntu:~/python-code$ python -m cProfile -o rep.prof pure_python.py 1000 1000
Total elements: 250000
0.00% complete
0.40% complete
0.80% complete
1.20% complete
1.60% complete
2.00% complete
2.40% complete
2.80% complete
3.20% complete
3.60% complete
4.00% complete
4.40% complete
4.80% complete
5.20% complete
5.60% complete
6.00% complete
6.40% complete
6.80% complete
7.20% complete
7.60% complete
8.00% complete
8.40% complete
8.80% complete
9.20% complete
9.60% complete
10.00% complete
10.40% complete
10.80% complete
11.20% complete
11.60% complete
12.00% complete
12.40% complete
12.80% complete
13.20% complete
13.60% complete
14.00% complete
14.40% complete
14.80% complete
15.20% complete
15.60% complete
16.00% complete
16.40% complete
16.80% complete
17.20% complete
17.60% complete
18.00% complete
18.40% complete
18.80% complete
19.20% complete
19.60% complete
20.00% complete
20.40% complete
20.80% complete
21.20% complete
21.60% complete
22.00% complete
22.40% complete
22.80% complete
23.20% complete
23.60% complete
24.00% complete
24.40% complete
24.80% complete
25.20% complete
25.60% complete
26.00% complete
26.40% complete
26.80% complete
27.20% complete
27.60% complete
28.00% complete
28.40% complete
28.80% complete
29.20% complete
29.60% complete
30.00% complete
30.40% complete
30.80% complete
31.20% complete
31.60% complete
32.00% complete
32.40% complete
32.80% complete
33.20% complete
33.60% complete
34.00% complete
34.40% complete
34.80% complete
35.20% complete
35.60% complete
36.00% complete
36.40% complete
36.80% complete
37.20% complete
37.60% complete
38.00% complete
38.40% complete
38.80% complete
39.20% complete
39.60% complete
40.00% complete
40.40% complete
40.80% complete
41.20% complete
41.60% complete
42.00% complete
42.40% complete
42.80% complete
43.20% complete
43.60% complete
44.00% complete
44.40% complete
44.80% complete
45.20% complete
45.60% complete
46.00% complete
46.40% complete
46.80% complete
47.20% complete
47.60% complete
48.00% complete
48.40% complete
48.80% complete
49.20% complete
49.60% complete
50.00% complete
50.40% complete
50.80% complete
51.20% complete
51.60% complete
52.00% complete
52.40% complete
52.80% complete
53.20% complete
53.60% complete
54.00% complete
54.40% complete
54.80% complete
55.20% complete
55.60% complete
56.00% complete
56.40% complete
56.80% complete
57.20% complete
57.60% complete
58.00% complete
58.40% complete
58.80% complete
59.20% complete
59.60% complete
60.00% complete
60.40% complete
60.80% complete
61.20% complete
61.60% complete
62.00% complete
62.40% complete
62.80% complete
63.20% complete
63.60% complete
64.00% complete
64.40% complete
64.80% complete
65.20% complete
65.60% complete
66.00% complete
66.40% complete
66.80% complete
67.20% complete
67.60% complete
68.00% complete
68.40% complete
68.80% complete
69.20% complete
69.60% complete
70.00% complete
70.40% complete
70.80% complete
71.20% complete
71.60% complete
72.00% complete
72.40% complete
72.80% complete
73.20% complete
73.60% complete
74.00% complete
74.40% complete
74.80% complete
75.20% complete
75.60% complete
76.00% complete
76.40% complete
76.80% complete
77.20% complete
77.60% complete
78.00% complete
78.40% complete
78.80% complete
79.20% complete
79.60% complete
80.00% complete
80.40% complete
80.80% complete
81.20% complete
81.60% complete
82.00% complete
82.40% complete
82.80% complete
83.20% complete
83.60% complete
84.00% complete
84.40% complete
84.80% complete
85.20% complete
85.60% complete
86.00% complete
86.40% complete
86.80% complete
87.20% complete
87.60% complete
88.00% complete
88.40% complete
88.80% complete
89.20% complete
89.60% complete
90.00% complete
90.40% complete
90.80% complete
91.20% complete
91.60% complete
92.00% complete
92.40% complete
92.80% complete
93.20% complete
93.60% complete
94.00% complete
94.40% complete
94.80% complete
95.20% complete
95.60% complete
96.00% complete
96.40% complete
96.80% complete
97.20% complete
97.60% complete
98.00% complete
98.40% complete
98.80% complete
99.20% complete
99.60% complete
Main took 0:02:34.268042
Total sum of elements (for validation): 1147734
afan@ubuntu:~/python-code$ python
Python 2.7.1+ (r271:86832, Apr 11 2011, 18:05:24) 
[GCC 4.5.2] on linux2
Type "help", "copyright", "credits" or "license" for more information.
>>> import pstats
Traceback (most recent call last):
  File "<stdin>", line 1, in <module>
ImportError: No module named pstats

ubuntu安装python pstats模块
afan@ubuntu:~$ sudo apt-get install python-profiler
[sudo] password for afan: 
正在读取软件包列表... 完成
正在分析软件包的依赖关系树       
正在读取状态信息... 完成       
建议安装的软件包:
  python-doc
下列【新】软件包将被安装:
  python-profiler
升级了 0 个软件包,新安装了 1 个软件包,要卸载 0 个软件包,有 122 个软件包未被升级。
需要下载 41.2 kB 的软件包。
解压缩后会消耗掉 233 kB 的额外空间。
获取:1 http://us.archive.ubuntu.com/ubuntu/ natty/multiverse python-profiler all 2.6.6-0ubuntu1 [41.2 kB]
下载 41.2 kB,耗时 2秒 (19.1 kB/s)         
选中了曾被取消选择的软件包 python-profiler。
(正在读取数据库 ... 系统当前共安装有 209452 个文件和目录。)
正在解压缩 python-profiler (从 .../python-profiler_2.6.6-0ubuntu1_all.deb) ...
正在设置 python-profiler (2.6.6-0ubuntu1) ...
>>> import pstats
>>> p = pstats.Stats('rep.prof')
>>> p.sort_stats('cumulative').print_stats(10)
Thu Jul 28 12:00:35 2011    rep.prof

         51927078 function calls (51926888 primitive calls) in 157.339 CPU seconds

   Ordered by: cumulative time
   List reduced from 535 to 10 due to restriction <10>

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
        1    0.037    0.037  157.343  157.343 pure_python.py:1(<module>)
        1    0.459    0.459  157.306  157.306 pure_python.py:23(calc_pure_python)
        1   94.657   94.657  154.268  154.268 pure_python.py:9(calculate_z_serial_purepython)
 51414667   56.833    0.000   56.833    0.000 {abs}
   250073    2.778    0.000    2.778    0.000 {range}
        1    0.096    0.096    1.986    1.986 /usr/local/lib/python2.7/dist-packages/numpy/__init__.py:106(<module>)
        1    0.026    0.026    1.662    1.662 /usr/local/lib/python2.7/dist-packages/numpy/add_newdocs.py:9(<module>)
        1    0.075    0.075    1.620    1.620 /usr/local/lib/python2.7/dist-packages/numpy/lib/__init__.py:1(<module>)
        1    0.052    0.052    1.029    1.029 /usr/local/lib/python2.7/dist-packages/numpy/lib/type_check.py:3(<module>)
        1    0.366    0.366    0.977    0.977 /usr/local/lib/python2.7/dist-packages/numpy/core/__init__.py:2(<module>)

<pstats.Stats instance at 0xb773ef2c>

afan@ubuntu:~/python-code$ runsnake rep.prof
查看代码性能
afan@ubuntu:~/python-code$ runsnake rep.prof


 Posted by at 3:56 下午
7月 07 2011
 
import sys

def fn(x):
    """Return 2*x**2 - 4*x + 2 for the number x."""
    return x * (2*x - 4) + 2


def main(argv):
    """Print x and fn(x) for every line of the file named by argv[1].

    Lines that cannot be parsed as a float are reported and skipped.
    Returns the process exit status: 1 when the argument count is wrong,
    0 otherwise.
    """
    if len(argv) != 2:
        print('usage: python quad_file.py file.txt')
        return 1

    # the with-block closes the file even if reading fails part-way
    with open(argv[1]) as data:
        for line in data:
            # drop the line ending so the error message stays on one line
            text = line.rstrip('\r\n')
            try:
                # convert the string to a float
                fx = float(text)
            except ValueError:
                print('%s cannot be converted' % text)
            else:
                print('%s %s' % (fx, fn(fx)))
    return 0


if __name__ == "__main__":
    sys.exit(main(sys.argv))

afan@ubuntu:~/python-code$ python quad_file.py data.txt 
1.0 0.0
2.0 2.0
4.0 18.0
8.0 98.0
16.0 450.0
#!/usr/bin/env python
import sys

def collate(paths):
    """Group the weights found in the given files by name.

    Every non-blank line must hold exactly two whitespace-separated fields,
    a name and a weight; a line with any other field count raises
    ValueError.  Returns a dict mapping each name to the list of its weights
    as floats, in the order they were read.
    """
    entries = {}
    for path in paths:
        # the with-block closes the file even if a line fails to parse
        with open(path) as f:
            for line in f:
                fields = line.split()
                if not fields:
                    # skip blank lines rather than failing on the unpacking
                    continue
                (name, weight) = fields
                entries.setdefault(name, []).append(float(weight))
    return entries


def format_row(name, weights):
    """Return one report line: left-aligned name, then each weight as %7.2f."""
    # two leading spaces per value reproduce the spacing that the original
    # comma-terminated print statements produced
    return "%-10s:" % name + "".join("  %7.2f" % w for w in weights)


def main(argv):
    """Collate the files named in argv[1:] and print one line per name.

    Returns the process exit status: 1 when no files are given, 0 otherwise.
    """
    if len(argv) <= 1:
        print("usage: %s files ... " % argv[0])
        return 1

    # collate the data into the entries dictionary
    entries = collate(argv[1:])
    for name in sorted(entries):
        print(format_row(name, entries[name]))
    return 0


if __name__ == "__main__":
    sys.exit(main(sys.argv))
afan@ubuntu:~/python-code$ python collate.py rat-weight 
r1        :    50.00    61.00    71.00
r2        :    55.00    64.00    76.00
r3        :    70.00    77.00    85.00
r4        :    65.00    75.00    83.00
 Posted by at 7:46 下午
5月 11 2011
 
-- Create the `menu` database with a single `fish` table and load 12 rows.
-- String literals use plain ASCII single quotes; typographic quotes are not
-- valid SQL string delimiters.
CREATE DATABASE `menu`;
USE menu;
DROP TABLE IF EXISTS `fish`;
-- remember the client character set so it can be restored after the CREATE
SET @saved_cs_client = @@character_set_client;
SET character_set_client = utf8;
CREATE TABLE `fish` (
`ID` int(11) NOT NULL auto_increment,
`NAME` varchar(30) NOT NULL default '',
`PRICE` decimal(5,2) NOT NULL default '0.00',
PRIMARY KEY (`ID`)
) ENGINE=MyISAM AUTO_INCREMENT=27 DEFAULT CHARSET=latin1;
SET character_set_client = @saved_cs_client;
LOCK TABLES `fish` WRITE;
INSERT INTO `fish` VALUES (1,'catfish','8.50'),(2,'catfish','8.50'),(3,'tuna','8.00'),(4,'catfish','5.00'),(5,'bass','6.75'),(6,'haddock','6.50'),(7,'salmon','9.50'),(8,'trout','6.00'),(9,'tuna','7.50'),(10,'yellowfin tuna','12.00'),(11,'yellowfin tuna','13.00'),(12,'tuna','7.50');
UNLOCK TABLES;


mysql> show tables;
+----------------+
| Tables_in_menu |
+----------------+
| fish           |
+----------------+
1 row in set (0.00 sec)

mysql> select * from fish;
+----+----------------+-------+
| ID | NAME           | PRICE |
+----+----------------+-------+
|  1 | catfish        |  8.50 |
|  2 | catfish        |  8.50 |
|  3 | tuna           |  8.00 |
|  4 | catfish        |  5.00 |
|  5 | bass           |  6.75 |
|  6 | haddock        |  6.50 |
|  7 | salmon         |  9.50 |
|  8 | trout          |  6.00 |
|  9 | tuna           |  7.50 |
| 10 | yellowfin tuna | 12.00 |
| 11 | yellowfin tuna | 13.00 |
| 12 | tuna           |  7.50 |
+----+----------------+-------+
12 rows in set (0.03 sec)

mysql> describe fish;
+-------+--------------+------+-----+---------+----------------+
| Field | Type         | Null | Key | Default | Extra          |
+-------+--------------+------+-----+---------+----------------+
| ID    | int(11)      | NO   | PRI | NULL    | auto_increment |
| NAME  | varchar(30)  | NO   |     |         |                |
| PRICE | decimal(5,2) | NO   |     | 0.00    |                |
+-------+--------------+------+-----+---------+----------------+
3 rows in set (0.01 sec)

mysql> select * from fish where id = 5;
+----+------+-------+
| ID | NAME | PRICE |
+----+------+-------+
|  5 | bass |  6.75 |
+----+------+-------+
1 row in set (0.00 sec)

query.py

import MySQLdb
# Connect to the local `menu` database (the password looks masked in the
# post -- replace it with the real one).
mydb = MySQLdb.connect(host = 'localhost', user = 'root', passwd = '******', db = 'menu')
try:
    cur = mydb.cursor()
    try:
        cur.execute('select * from fish')
        results = cur.fetchall()
    finally:
        cur.close()
finally:
    # release the connection even when the query fails
    mydb.close()

for record in results:
    # the fish table's columns are ID, NAME, PRICE
    print("%s --> %s @ %s each" % (record[0], record[1], record[2]))

afan@ubuntu:~/python-code$ python query.py
1 --> catfish @ 8.50 each
2 --> catfish @ 8.50 each
3 --> tuna @ 8.00 each
4 --> catfish @ 5.00 each
5 --> bass @ 6.75 each
6 --> haddock @ 6.50 each
7 --> salmon @ 9.50 each
8 --> trout @ 6.00 each
9 --> tuna @ 7.50 each
10 --> yellowfin tuna @ 12.00 each
11 --> yellowfin tuna @ 13.00 each
12 --> tuna @ 7.50 each
 Posted by at 5:39 下午
5月 04 2011
 
实现groupby
#!/usr/bin/python
from itertools import groupby
from operator import itemgetter

def summary(data, key=itemgetter(0), value=itemgetter(1)):
    """Yield (key, total) pairs for runs of consecutive records in data.

       Records are grouped with itertools.groupby, so only adjacent records
       with equal keys are combined; sort data by key first if equal keys
       may be scattered.

       key and value are functions that take a record and return its
       grouping key (default: first item) and the amount to add up
       (default: second item).
    """

    for group_key, rows in groupby(data, key):
        total = 0
        for row in rows:
            total = total + value(row)
        yield (group_key, total)

if __name__ == "__main__":
    # Example: given a set of sales data for city within region,
    # produce a sales report by region
    sales = [('Scotland', 'Edinburgh', 20000),
             ('Scotland', 'Glasgow', 12500),
             ('Wales', 'Cardiff', 29700),
             ('Wales', 'Bangor', 12800),
             ('England', 'London', 90000),
             ('England', 'Manchester', 45600),
             ('England', 'Liverpool', 29700)]

    # records of one region are already adjacent, as groupby requires
    for region, total in summary(sales, key=itemgetter(0), value=itemgetter(2)):
        # right-align the region in 10 columns, e.g. "  Scotland: 32500"
        print("%10s: %d" % (region, total))

$python groupby.py
  Scotland: 32500
     Wales: 42500
   England: 165300

实现多key groupby
#!/usr/bin/python
from itertools import groupby
from operator import itemgetter

def set_keys(*indices):
    """Build a key function that picks the given positions out of a record.

    The returned function takes a sequence and returns a tuple with the
    items found at indices, in the order the indices were given.
    """
    def get_keys(seq, indices=indices):
        return tuple(seq[position] for position in indices)
    return get_keys
   

def summary(data, key=itemgetter(0), value=itemgetter(1)):
    """Total *value* over each run of records that share the same *key*.

    *key* (default: first item) and *value* (default: second item) are
    functions that take one data record and return the relevant field.
    The result is a generator of ``(key, total)`` tuples, one per group
    produced by ``itertools.groupby``; sort *data* by *key* beforehand if
    equal keys are not already adjacent.
    """
    for group_key, members in groupby(data, key):
        running_total = 0
        for record in members:
            running_total = running_total + value(record)
        yield (group_key, running_total)

if __name__ == "__main__":
    # Example: given a set of sales data for branch within city within
    # region, produce a sales report by (region, city)
    sales = [('Scotland', 'Edinburgh', 'Branch1', 20000),
             ('Scotland', 'Glasgow', 'Branch1', 12500),
             ('Scotland', 'Glasgow', 'Branch2', 12000),
             ('Wales', 'Cardiff', 'Branch1', 29700),
             ('Wales', 'Cardiff', 'Branch2', 30000),
             ('Wales', 'Bangor', 'Branch1', 12800),
             ('England', 'London', 'Branch1', 90000),
             ('England', 'London', 'Branch2', 80000),
             ('England', 'London', 'Branch3', 70000),
             ('England', 'Manchester', 'Branch1', 45600),
             ('England', 'Manchester', 'Branch2', 50000),
             ('England', 'Liverpool', 'Branch1', 29700),
             ('England', 'Liverpool', 'Branch2', 25000)]

    # Sorting puts rows with the same (region, city) next to each other,
    # which the groupby-based summary() needs.
    sales.sort()
    for (region, city), total in summary(sales, key=set_keys(0,1), value=itemgetter(3)):
        # Two left-aligned 10-character name columns, then the total
        # right-aligned in 8 characters.  A single parenthesised argument
        # prints the same under Python 2 and 3.
        print("%-10s  %-10s : %8d" % (region, city, total))

$python mgroupby.py
England     Liverpool  :    54700
England     London     :   240000
England     Manchester :    95600
Scotland    Edinburgh  :    20000
Scotland    Glasgow    :    24500
Wales       Bangor     :    12800
Wales       Cardiff    :    59700


from:
http://code.activestate.com
 Posted by at 10:41 上午
2月 162011
 
On many occasions, data needs to be normalized before entering a database in order to speed up query operations. Interestingly, SAS’s data step array cannot accept mixed data types (both numeric and character ones), which makes it unusable for such a purpose. However, SAS’s procedures provide handy tools to automate this process without resorting to any loop. Large text files have to depend on Python, given its excellent memory management.

The examples below show how to transform a tiny SASHELP.CLASS dataset into a normalized and digestible format, by either SAS or Python.

****************(1) USING PROCEDURES FROM SAS***************************;
/* Read the CSV file into the dataset CLASS, replacing it if it exists. */
proc import datafile = "c:\tmp\class.csv" out = class replace dbms = csv;
run;

/* Turn the variables from sex through weight into rows, grouped by name. */
/* NOTE(review): BY processing presumably needs CLASS ordered by name - confirm. */
proc transpose data = class out = normalized;
   by name;
   var sex -- weight;
run;

/* Write NORMALIZED to a space-delimited text file with no header row. */
proc export data = normalized outfile = "c:\tmp\normalized_sas.txt" 
   dbms = dlm replace;
   putnames = no; 
   delimiter = " ";
run;

################(2) USING LOOPS FROM PYTHON###############################
# Maps each name (first CSV column) to a {variable name: value} dict.
new_file = {}
# Names in file order, so the output keeps the input's row order.
name = []
with open('c:/tmp/class.csv', 'r') as myfile:
    # The header row holds the variable names; drop the leading name column.
    varname = myfile.readline().strip().split(',')
    varname.pop(0)
    for each_line in myfile:
        stripped_line = each_line.strip()
        if not stripped_line:
            # Ignore blank lines (e.g. a trailing newline at end of file).
            continue
        row = stripped_line.split(',')
        this_name = row.pop(0)
        # Storing the record must happen once per row, inside the loop;
        # otherwise only the last row of the file is kept.
        new_file[this_name] = dict(zip(varname, row))
        name.append(this_name)

# One "name variable value" line per cell; the with-block closes the file
# even if writing fails part-way.
with open('c:/tmp/normalized_python.txt', 'w') as out_file:
    for that_name in name:
        for that_var in varname:
            print(that_name, that_var, new_file[that_name][that_var],
                  file=out_file)
 
2月 162011
 
On many occasions, data needs to be normalized before entering a database in order to speed up query operations. Interestingly, SAS’s data step array cannot accept mixed data types (both numeric and character ones), which makes it unusable for such a purpose. However, SAS’s procedures provide handy tools to automate this process without resorting to any loop. Large text files have to depend on Python, given its excellent memory management.

The examples below show how to transform a tiny SASHELP.CLASS dataset into a normalized and digestible format, by either SAS or Python.

****************(1) USING PROCEDURES FROM SAS***************************;
/* Read the CSV file into the dataset CLASS, replacing it if it exists. */
proc import datafile = "c:\tmp\class.csv" out = class replace dbms = csv;
run;

/* Turn the variables from sex through weight into rows, grouped by name. */
/* NOTE(review): BY processing presumably needs CLASS ordered by name - confirm. */
proc transpose data = class out = normalized;
   by name;
   var sex -- weight;
run;

/* Write NORMALIZED to a space-delimited text file with no header row. */
proc export data = normalized outfile = "c:\tmp\normalized_sas.txt" 
   dbms = dlm replace;
   putnames = no; 
   delimiter = " ";
run;

################(2) USING LOOPS FROM PYTHON###############################
# Maps each name (first CSV column) to a {variable name: value} dict.
new_file = {}
# Names in file order, so the output keeps the input's row order.
name = []
with open('c:/tmp/class.csv', 'r') as myfile:
    # The header row holds the variable names; drop the leading name column.
    varname = myfile.readline().strip().split(',')
    varname.pop(0)
    for each_line in myfile:
        stripped_line = each_line.strip()
        if not stripped_line:
            # Ignore blank lines (e.g. a trailing newline at end of file).
            continue
        row = stripped_line.split(',')
        this_name = row.pop(0)
        # Storing the record must happen once per row, inside the loop;
        # otherwise only the last row of the file is kept.
        new_file[this_name] = dict(zip(varname, row))
        name.append(this_name)

# One "name variable value" line per cell; the with-block closes the file
# even if writing fails part-way.
with open('c:/tmp/normalized_python.txt', 'w') as out_file:
    for that_name in name:
        for that_var in varname:
            print(that_name, that_var, new_file[that_name][that_var],
                  file=out_file)
 
1月 222011
 

Calculate the factorial of 171 (171!)? Just TRY! It is equal to 171*170*169*….2*1.

1. Google calculator

As a Google fanatic, I first tried to search for the answer via Google:

Google171

Whoops, nothing interesting was returned! Type “170!” and get the output:

Google170 Why does this kind of thing happen in this calculator? 171! is just equal to 171*170!.

2. Excel

Switch to Excel spreadsheet. Function fact(*) used:

Excel170 Excel171 Oo, interesting. The same.

3. SAS

Google and Excel may be the niche players in calculators’ family. Why not try to use some programming languages?

As a SAS programmer, my handy tool is SAS of course.

First, I use a SAS data step with its built-in function fact(*):

/* Evaluate the FACT function at 170 and 171 and write both results */
/* with PUT; no output dataset is created (_NULL_).                  */
data _null_;
    x=fact(170);
    y=fact(171);
    put x= y=;
run;

and I get

NOTE: Invalid argument to function FACT at line 49 column 7.
x=7.257416E306 y=.
x=7.257416E306 y=. _ERROR_=1 _N_=1
NOTE: Mathematical operations could not be performed at the following places. The results of the operations have been set to missing values.

Expected or unexpected? I don’t know how this fact(*) function is defined, so I tried to define a function to calculate factorials myself. In SAS 9.2, you can use PROC FCMP (also available in 9.1.3 as an experimental procedure):

/* Define a recursive FACTORIAL function and store it in work.funcs.math. */
proc fcmp outlib = work.funcs.math ;
    function factorial(k) ;
        /* Base case: 0! is 1. */
        if k = 0 then return(1) ;
        z = k ; *preserve k ;
        x = factorial(k-1) ;
        k = z ; *recover k ;
        /* k! = k * (k-1)! */
        k = k * x ;
        return(k) ;
    endsub ;
quit ;

/* Point CMPLIB at work.funcs so later steps can find FACTORIAL. */
options cmplib=work.funcs ;

Use this self-defined function to get 170!

/* Call the user-defined FACTORIAL with 170 and print the result. */
proc fcmp ;
    x = factorial (170) ;
    put x = ;
run ;

The FCMP procedure returns

x=7.257416E306

Try to calculate 171! ?

/* Call the user-defined FACTORIAL with 171 and print the result. */
proc fcmp ;
    y = factorial (171) ;
    put y = ;
run ;

Just get the overflow error. The interaction stops at 170!:

ERROR: An overflow occurred during execution in function ‘factorial’ in statement number 7 at   line 10 column 1.
       The statement was:
    1      (10:1)    k = (k=171) * (x=7.257416E306)

The function definition above uses recursion, which may limit efficiency. We could try a loop without recursion instead. SAS/IML doesn’t support recursion, so let’s call SAS/IML to the court:

/* Define an iterative FACTORIAL module in IML and print it for 170 and 171. */
proc iml;
    start factorial (n);
        /* Accumulate the product 1*2*...*n in FACT. */
        fact=1;
        do i=1 to n;
        fact=fact*i;
        end;
        return (fact);
    finish factorial;

    x= factorial (170);
    print x;

    y= factorial (171);
    print y;
quit;

Again, I get 170!

    x
7.257E306

and an overflow error for 171!

            y= factorial (171);
ERROR: Overflow error in *.

Turing, Von Neumann and Tony, what happened?

4. R

When SAS failed, lots of voices pop up: use R! OK, Rction!

> x=factorial(170);x
[1] 7.257416e+306
> y=factorial(171);y
Warning message:
In factorial(171) : value out of range in ‘gammafn’
[1] Inf

5. C++

I don’t want to lose my patience. Think C++(use both recursive and non-recursive methods):

#include <iostream>
using namespace std;

double factRecursive(double num);
double factNonRecursive(double num);

int main()
{
    cout<<endl;   
    cout<<"Recursive: the factorial of 170 is "<<factRecursive(170)<<endl;
    cout<<"Recursive: the factorial of 171 is "<<factRecursive(171)<<endl;
    cout<<endl;  

    cout<<"NonRecursive: the factorial of 170 is "<<factNonRecursive(170)<<endl;
    cout<<"NonRecursive: the factorial of 171 is "<<factNonRecursive(171)<<endl;
    cout<<endl;   

return 0;
}

// Returns num! computed recursively as num * (num - 1)!, with 0! == 1.
// The recursion only terminates when num reaches exactly 0.
double factRecursive (double num)
{
    return (num == 0) ? 1 : num * factRecursive(num - 1);
}

// Returns num! computed iteratively by multiplying 2, 3, ..., num in turn.
// Inputs below 2 skip the loop and yield 1.
double factNonRecursive (double num)
{
    double product = 1;
    double factor = 2;
    while (factor <= num)
    {
        product *= factor;
        factor++;
    }
    return product;
}

Unfortunately, same story once more:

Cpp

Well, the story has played out like this. The limitation may lie not in the language but in the machine. I checked the largest number my computer supports:

#include <iostream>
#include <cfloat>

using namespace std;

int main()
{
  cout<<"maxinum double value of machine: "<<DBL_MAX<<endl;
  return 0;
}

maxinum double value of machine: 1.79769e+308

Now everything’s in the open. The factorial of 170 is about 7.257416e+306. 171! is too big to be supported by my PC.

(Note: I put this code on http://codepad.org, an online compiler. If you don’t have a C++ compiler on your machine, you can see the code and outputs at http://codepad.org/xnneavsw and http://codepad.org/3FeEC9t2)

6. WolframAlpha

After struggling for hours, I turned to the WolframAlpha computing platform. It returns the factorial of 171 AT LAST:

WA171 WA171_s  AT LAST we know the factorial of 171 has 310 digits.

7. Windows Calculator

I tried the Windows built-in calculator. Amazingly, it is powerful:

winCalc

8. Python

Return to programming language.  First, I defined a function(recursive version) in Python and then use its MATH library:

>>> def factorial(n):

    if n==0:

        return 1

    else:

        return n*factorial(n-1)

>>> factorial(170)

7257415615307998967396728211129263114716991681296451376

5435777989005618434017061578523507492426174595114909912

3783852077666602256544275302532890077320751090240043028

0058295603966612599658257104398558294257568966313439612

2625710949468067112055688804571933402126614528000000000

00000000000000000000000000000000L

>>> factorial(171)

1241018070217667823424840524103103992616605577501693185

3889518036119960752216917529927519781204875855764649595

0167038705280988985869071076733124203221848436431047357

7889968548278290754541561964852153468318044293239598173

6968996572359039476161522785581800611763651084288000000

00000000000000000000000000000000000L

>>> import math

>>> math.factorial(171)

1241018070217667823424840524103103992616605577501693185

3889518036119960752216917529927519781204875855764649595

0167038705280988985869071076733124203221848436431047357

7889968548278290754541561964852153468318044293239598173

6968996572359039476161522785581800611763651084288000000

00000000000000000000000000000000000L

Amazing, Python beats up C++!

(to be continued :

Too Big to Be Accurate(2): Approximation

)

1月 152011
 

The past three years witnessed the rise of R, an open source statistical software. Search R related books in Amazon, and tons of recent titles show up ranging from graphics to scientific computation. Thanks to those graduates sprang out of school that received R training in their statistics major, R starts to appear in some serious business. The basic difference is that license of SAS is sold by SAS Institute, a company with 20k employees, while R is free. In their book ‘SAS and R’, Ken and Nicholas systematically compared the two packages. Even though they carefully avoided the sensitive question that which one is better, the readers can easily make a conclusion that R can do the work equally well as SAS. Then the next question is: why not freebie?

R enjoys many cutting-edge features. First, R is a functional language. Writing a function is simple and quick, since the return is always an object. In SAS, implementation of a function is cumbersome, and most SAS programmers use macros instead. Second, in data visualization, R is indispensable, owing to a number of creative packages such as ‘lattice’. Recently SAS struck back with its SG procedures, which are also based on Trellis' concept of high-level plotting. Even though R is more versatile, in most cases they look equally good. In data mining, since ‘R is a leading language for developing new statistical methods’ (admitted by SAS when it announced that SAS can call R function in its IML module), the packages available in R are more resourceful than the secretive procedures covered by SAS Enterprise Miner. To name some of them: rpart for decision tree; randomForest for random forests; nnet for neural network; e1071 or kernlab for support vector machines; e1071 for Naive Bayes; earth for multivariate adaptive regression splines; RWeka for boosting. No doubt, any emerging data mining technology can find its counterpart in R. In his new book, Dr. Torgo gave four illustrations using R in data mining for ecology, stock market, fraud detection and bioinformatics, respectively. Interestingly, biology people and finance people seem least interested in SAS products.

Big data is a curse for R, while data mining is always data-intensive. The OOP feature actually backfired on R. Everything, even the raw data, turns out to be an object in memory. No doubt it speeds up the computation, while the side-effect is that memory in an R system tends to overflow easily. The R programmer has to be constantly aware of memory usage. I used to be scared by the noise while SAS reads data from hard disk after submitting code. But the strategy works even when the data set is larger than the physical memory. As long as it does not freeze, let SAS run. After selling his SPSS to IBM, Dr. Norman Nie became CEO of Revolution Analytics, a commercial provider of R. Dr. Nie’s innovation for R is to introduce the cliché: use an XDF file system in hard disk to store input data. Another distinction between them is that R reads data by column and SAS reads data by row. In R, the work after reading data includes splitting rows and piecing them together. In SAS, data integration is handy with the help of data step’s inherent iteration and numerous unique informats. Another pitfall for R in data management is that it does not support native SQL, while Proc SQL makes SAS as capable an RDBMS as other mainstream RDBMSs.

A platform called PCR to compensate R’s shortcoming may be implemented, based on open source software (P: Python to integrate and manipulate data; C: MySQL or SQLite to store and query data; R: R to model and visualize data). Python has peerless capacity in processing complicated data. A database between Python and R avoids the generations of CSV files. Call R function in Python by RSpython(rpy2) or rpy provides other alternatives for direct communication. The underlying principle through PCR is to subset data by scripting or SQL query, and feed R the piece it can absorb. Dr. Janert uses Python partly assisted by R to go through data analysis process in his thoughtful book. Hope next book he would use R more in modeling and explore more in data mining. From my experience, working Python, SQLite and R together is pleasant and productive.

SAS should open its data mining procedures for coding. Many procedures are still under the license of Enterprise Miner, such as Proc Arboretum and Proc SVM. It is difficult to code them like other SAS procedures. SAS is far better than R in data management, while 80% work of data mining usually happens when transforming dirty data to workable data. For the R platform, the cost of hiring a qualified worker experienced in Python, SQL and R may be high. In summary, like the past successful efforts to redeem its reputation in data visualization, SAS should do more for the fast growing data mining market.

References: 1. Luis Torgo. Data Mining with R: Learning with Case Studies. Chapman and Hall, 2010.
2. Ken Kleinman, Nicholas J. Horton. SAS and R: Data Management, Statistical Analysis, and Graphics. Chapman and Hall, 2010.
3. Philipp K. Janert.Data Analysis with Open Source Tools. O'Reilly Media, 2010.

***********SAS'S SHOW**********************;
/* List the variables of SASHELP.CARS in short form, by position. */
proc contents data=sashelp.cars position short; run;

/* Scatter-plot matrix of the numeric variables with an ellipse per panel. */
proc sgscatter data=sashelp.cars;
matrix MSRP Invoice EngineSize  Horsepower MPG_City MPG_Highway Weight Wheelbase Length
/ellipse=(alpha=0.25) markerattrs=(size=1);
run;

/* Export SASHELP.CARS to d:\cars.csv, the file the R code reads. */
proc export data=sashelp.cars outfile='d:\cars.csv'; run;
############NOW R'S TIME###############
# Load the CSV exported from SAS and show a summary of every column.
cars <- read.csv("d:/cars.csv")
summary(cars)
# attach() puts the columns on the search path so they can be used by name.
attach(cars)
# Correlation matrix of the nine numeric variables.
car_matrix <- cor(cbind(MSRP, Invoice, EngineSize, Horsepower,
MPG_City, MPG_Highway,  Weight, Wheelbase, Length))
#CITE SARKAR'S METHODS
# level.colors, panel.polygon, panel.text, do.breaks and levelplot are
# lattice functions, so the package must be loaded.
library(lattice)

# Panel function (after Sarkar) that draws one ellipse per cell, shaped by
# the correlation z[i] and centred at (x[i], y[i]); with label = TRUE the
# correlation (times 100, rounded) is printed on top of each ellipse.
panel.corrgram <- function(x, y, z, subscripts, at,
                           level = 0.9, label = FALSE, ...)
{
    require("ellipse", quietly = TRUE)
    # Map each correlation to a fill colour using the break points in `at`.
    zcol <- level.colors(z, at = at, ...)
    for (i in seq(along = z))
    {
        ell <- ellipse(z[i], level = level, npoints = 50,
                       scale = c(.2, .2), centre = c(x[i], y[i]))
        panel.polygon(ell, col = zcol[i], border = zcol[i], ...)
    }
    if (label)
        panel.text(x = x, y = y, lab = 100 * round(z, 2), cex = 0.8,
                   col = ifelse(z < 0, "white", "black"))
}

# NOTE(review): the end of this snippet was mangled during extraction.
# Only the argument names at, xlab, ylab, colorkey, col..., scales, rot,
# panel and label survived.  The plotted object (car_matrix), the upper
# break 1.01, the break count 20, space = "top", the label colours and the
# exact spelling of the colour argument (col.regions) are reconstructed
# from Sarkar's corrgram example - confirm against the original post.
levelplot(car_matrix, at = do.breaks(c(-1.01, 1.01), 20),
          xlab = NULL, ylab = NULL,
          colorkey = list(space = "top"), col.regions = gray.colors,
          scales = list(x = list(rot = 90)),
          panel = panel.corrgram, label = TRUE)


12月 082010
 

map函数func作用于给定序列的每个元素,并用一个列表来提供返回值。
map函数python实现代码:

def map(func,seq):
    """Return a list holding ``func(item)`` for every item of *seq*, in order."""
    return [func(element) for element in seq]
filter函数的功能相当于过滤器。调用一个布尔函数bool_func来迭代遍历每个seq中的元素;返回一个使bool_func返回值为true的元素的序列。
filter函数python代码实现:

def filter(bool_func,seq):
    """Return a list of the items of *seq* for which *bool_func* is true.

    Items keep their original order; *bool_func* is called once per item.
    """
    return [element for element in seq if bool_func(element)]


reduce函数,func为二元函数,将func作用于seq序列的元素,每次携带一对(先前的结果以及下一个序列的元素),连续的将现有的结果和下一个值作用在获得的随后的结果上,最后减少我们的序列为一个单一的返回值。
reduce函数python代码实现:

def reduce(bin_func,seq,initial=None):
    """Fold *seq* into a single value by repeatedly applying *bin_func*.

    *bin_func* is called as ``bin_func(accumulated, item)`` from left to
    right.  The fold starts from *initial* when it is given (not None);
    otherwise the first item of *seq* is used as the starting value, in
    which case an empty *seq* raises IndexError.
    """
    remaining = list(seq)
    accumulated = remaining.pop(0) if initial is None else initial
    for element in remaining:
        accumulated = bin_func(accumulated, element)
    return accumulated


下面是测试的代码

#coding:utf-8

def map_func(lis):
    """Return *lis* increased by one."""
    incremented = lis + 1
    return incremented

def filter_func(li):
    """Return True when *li* is evenly divisible by 2, otherwise False."""
    return li % 2 == 0
        
def reduce_func(li, lis):
    """Return the sum of the two arguments (left operand first)."""
    combined = li + lis
    return combined
    
# reduce is a builtin only on Python 2; this import works on both 2 and 3.
from functools import reduce

li = [1,2,3,4,5]

# list() keeps the results printable as lists on Python 3, where the
# builtin map/filter return lazy iterators.
map_l = list(map(map_func, li))  # add 1 to every number in li
filter_l = list(filter(filter_func, li))  # keep the numbers in li divisible by 2
reduce_l = reduce(reduce_func, li)  # 1+2+3+4+5

# A single parenthesised argument prints the same under Python 2 and 3.
print(map_l)
print(filter_l)
print(reduce_l)

运行结果如下:
[2, 3, 4, 5, 6]
[2, 4]
15
 Posted by at 5:52 下午