Giochino di ieri (nato per tutt'altro motivo): inline assembly in Python.
codice:
import tempfile
import subprocess
import ctypes
import os.path
libc = ctypes.CDLL("libc.so.6")
# Some constants
PROT_READ = 1
PROT_WRITE = 2
PROT_EXEC = 4
class ExecutableBuffer:
def __init__(self, size):
# allocate on page boundaries
c_size = ctypes.c_size_t(size)
self.addr = ctypes.c_void_p(libc.valloc(c_size))
self.size = size
if 0 == self.addr:
raise Exception("Failed to allocate memory")
if 0 != libc.mprotect(self.addr, c_size, ctypes.c_int(PROT_READ | PROT_WRITE | PROT_EXEC)):
raise Exception("Failed to set protection on buffer")
def __del__(self):
# free allocated memory
# if libc has already been deleted (=program shutdown)
# ignore the free - the program is terminating anyway
if libc:
libc.free(self.addr)
def asm(fnproto, code):
def doasm(code, opts = []):
with tempfile.NamedTemporaryFile() as ifd:
ifd.write(code)
ifd.flush()
oname = None
with tempfile.NamedTemporaryFile() as ofd:
subprocess.check_call(["nasm", ifd.name, "-o", ofd.name, "-f", "bin"] + opts)
return ofd.read()
# first assemble without any branch size optimization, to obtain the maximum
# size we can expect from the final assembled output
max_code_sz = len(doasm(code, ['-O 0']))
# get an executable buffer of the right size
code_buf = ExecutableBuffer(max_code_sz)
# assemble again, now specifying the correct code origin; this avoids the
# need to write position-independent code
code = doasm("ORG 0x%x\n%s" % (code_buf.addr.value, code), ['-O 2'])
if len(code) > max_code_sz:
raise RuntimeError("Unexpected assembled size: we expected a maximum of %d bytes, got %d bytes" % (max_code, len(code)))
# copy in the executable buffer
ctypes.memmove(code_buf.addr, ctypes.c_char_p(code), len(code))
# create a ctypes function pointer with the required prototype
fn = ctypes.cast(code_buf.addr, fnproto)
# associate the code buffer as an attribute of the function object, tying
# their lifetime together
fn.asm_backing_buffer = code_buf
return fn
fibo = asm(ctypes.CFUNCTYPE(ctypes.c_ulonglong, ctypes.c_ulonglong),"""
bits 64
; maximum fibonacci number in a 64 bit integer without overflow
; used to size the cache
max_64bit_fibo equ 93
start:
; computes the rdi-th fibonacci number (with memoization)
; returns in rax (as per regular SystemV AMD64 ABI)
fibo:
cmp rdi,max_64bit_fibo
jl good
; if the number would overflow anyway, just quit
xor eax,eax
ret
good:
; check if we have it memoized; no need for rip-addressing, as
; asm makes sure to pass the correct origin to nasm
lea rsi,[rdi*8+memo_start]
mov rax,[rsi]
test rax,rax
jnz quit
; we have to calculate; keep the cache address on the stack
; for when we'll store the result
push rsi
; base case: 0 and 1 => 1
cmp rdi,2
mov eax,1
jl quit_memo
; do fibo(input-1)
dec rdi
push rdi
call fibo
pop rdi
; save the result
push rax
; do fibo(input-2)
dec rdi
call fibo
; grab the previous result and sum
pop rbx
add rax,rbx
quit_memo:
; quit and memoize; pick up the cache address we saved above
; and store the result
pop rsi
mov [rsi],rax
quit:
ret
memo_start:
; reserved memory for the memoization cache; starts at 0
; notice that we have RWX permissions on this block of memory
; (unlike regular .text segments), so we can just use some space
; here as RW cache
times max_64bit_fibo dq 0
""")
while True:
print fibo(int(raw_input()))
Finalmente risolti tutti i problemi di performance di Python.
(per provarlo, richiede una macchina Linux/OSX a 64 bit e nasm installato)