Blake O'Hare .com

Asynchronous HTTP Requests in Python

Tag: Python
Twice now I have found myself disappointed by the inability to make asynchronous HTTP requests in Python without installing (and forcing users to install) some silly library, which means it's time to just roll my own.

This is one file. Just drop it into your codebase and import it as a module. No installations. It uses Python's native urllib and threading modules. It works in both Python 2 and Python 3.

This is particularly useful for things like games or other UI apps where blocking a thread is unacceptable.

Sample Usage

import fetcher # or whatever you name the file
import time

request = fetcher.HttpAsyncRequest('http://www.blakeohare.com/blog/')

# Constructing the request does not send it; send() starts the background
# thread and returns immediately. Without it the loop below never ends.
request.send()

# Poll until the worker thread has stored the response.
while not request.is_complete():
  print('Waiting...')
  time.sleep(.05)

print(request.get_response_code(), request.get_response_message())
print('-' * 10)
print(request.get_response_content('t'))


The Code

# This module makes asynchronous HTTP requests in Python.
# This runs in Python 2.x* and 3.x
# This requires no special library to be installed.
# This is public domain.
# This is provided as-is.
# Go nuts.

# * HTTP header names in the response are all lowercase in Python 2.x. This seems to be
#   a limitation in urllib. Nothing I can do about this other than fall back to TCP/IP and 
#   parse the response manually. Although if I'm incorrect in assuming this, please let 
#   me know.

# Default User-Agent header value given to every new request; change it with
# set_user_agent().
_user_agent = "Blake's Magic Python Async HTTP Fetcher vee one point oh"

import sys as _sys
import threading as _threading

# True on Python 2.x, where the URL helpers live in urllib/urllib2/urlparse
# rather than in the urllib.parse/urllib.request submodules.
_is_old = _sys.version_info[0] < 3
if _is_old:
    import urllib as _urllib
    import urllib2 as _urllib2
    import urlparse as _urlparse
else:
    import urllib as _urllib
    import urllib.parse as _urllib_parse
    import urllib.request as _urllib_request
    
def _parse_url(url):
    """Split url into its components with the running interpreter's URL parser.

    Returns the parse result of urlparse (scheme, netloc, path, params,
    query, fragment, plus the hostname/port attributes).
    """
    # _urlparse is only imported on Python 2; Python 3 exposes the same
    # function through urllib.parse, so pick whichever module exists.
    if _is_old:
        return _urlparse.urlparse(url)
    return _urllib_parse.urlparse(url)

def set_user_agent(value):
    """Replace the module-wide default User-Agent string with value.

    The stored string is what HttpAsyncRequest reads when a request object is
    constructed, so only requests created after this call pick it up.
    """
    global _user_agent
    _user_agent = value

def decode_url_value(value):
    """Turn a percent-escaped URL component back into text.

    On Python 2 the unquoted byte string is additionally decoded as UTF-8;
    on Python 3 the unquote call is returned as-is.
    """
    if not _is_old:
        return _urllib_parse.unquote(value)
    unescaped = _urllib.unquote(value)
    return unescaped.decode('utf8')

def encode_url_value(value):
    """Percent-escape value so it can be placed inside a URL.

    On Python 2 the text is encoded to UTF-8 bytes before quoting; on
    Python 3 the string is quoted directly.
    """
    if _is_old:
        # quote() is documented on urllib; urllib2 only happens to re-export
        # it, so use the same module decode_url_value relies on.
        return _urllib.quote(value.encode('utf8'))
    return _urllib_parse.quote(value)

def _send_impl(req_obj, method, url, headers, content):
    """Perform the blocking HTTP exchange and hand the result to req_obj.

    This is the target of the worker thread started by
    HttpAsyncRequest.send().

    req_obj -- object whose _set_result(code, message, content, headers)
               receives the outcome.
    method  -- HTTP verb to send, e.g. 'GET'.
    url     -- fully assembled URL.
    headers -- sequence of (name, value) pairs to add to the request.
    content -- request body, or None for no body.

    NOTE: failures that produce no HTTP response at all (for example a
    connection error) still propagate out of this function, so _set_result
    is never called for them.
    """
    # Same API on both versions, just hosted in a differently named module.
    lib = _urllib2 if _is_old else _urllib_request

    opener = lib.build_opener(lib.HTTPHandler)
    if content is None:
        request = lib.Request(url)
    else:
        request = lib.Request(url, data=content)
    for name, value in headers:
        request.add_header(name, value)
    # Overriding get_method is the way to send an arbitrary verb that works
    # on both Python 2 and Python 3.
    request.get_method = lambda: method

    try:
        output = opener.open(request)
    except lib.HTTPError as error:
        # 4xx/5xx answers are raised as HTTPError, but the error object is
        # itself a response (code, msg, headers, read()). Report it instead
        # of letting the thread die and the request stay incomplete forever.
        output = error

    try:
        body = output.read()
        response_headers = {}
        for header_key in output.headers.keys():
            response_headers[header_key] = output.headers[header_key]
        response_message = output.msg
        response_code = output.code
    finally:
        # Release the underlying connection even if reading failed.
        output.close()

    req_obj._set_result(response_code, response_message, body, response_headers)

class HttpAsyncRequest:
    """One HTTP request that is executed on a background thread.

    Configure the request with the set_*/add_*/clear_* methods, call send(),
    then poll is_complete() before reading the response through the
    get_response_* accessors.
    """

    def __init__(self, url):
        """Parse url and prepare an unsent GET request for it.

        Raises Exception if url cannot be parsed or lacks a scheme or a
        network location.
        """
        bad_format = False
        try:
            if _is_old:
                url_parts = _parse_url(url)
            else:
                url_parts = _urllib_parse.urlparse(url)
            if url_parts.scheme == '' or url_parts.netloc == '':
                bad_format = True
        except Exception:
            bad_format = True
        if bad_format:
            raise Exception("Bad URL! Bad!")

        # Guards the response fields, which are written by the worker thread.
        self.mutex = _threading.Lock()
        self.method = 'GET'
        self.scheme = url_parts.scheme
        self.host = url_parts.hostname
        self.port = url_parts.port
        self.path = url_parts.path
        self.fragment = url_parts.fragment
        self.params = url_parts.params
        self.original_query = url_parts.query # use this if query params are not modified
        self.query = None # once modified, a dict of decoded key -> list of values
        self.header_formatting = {} # lowercase header key -> key as originally written
        self.header_values = {} # lowercase header key -> list of values for that header
        self.content = None
        self.set_header('User-Agent', _user_agent)
        self.done = False # not read anywhere in this class; completion is tracked via response_code
        self.response_code = -1 # -1 means "no response stored yet"
        self.response_message = None
        self.response_content = None
        self.response_headers_values = None
        self.response_headers_formatting = None

    def _build_url(self):
        """Assemble the URL to request from the current components."""
        url = self.scheme + '://' + self.host

        if self.port is not None:
            url += ':' + str(self.port)

        if self.path:
            path = self.path
            if path[0] != '/':
                path = '/' + path
            url += path

        if self.params:
            url += ';' + self.params

        if self.query is None:
            # Query was never touched: reuse the text from the constructor.
            if self.original_query != '':
                url += '?' + self.original_query
        else:
            queries = []
            # sorted() gives deterministic requests and, unlike slicing
            # dict.keys(), also works on Python 3.
            for key in sorted(self.query):
                e_key = encode_url_value(key)
                for value in self.query[key]:
                    queries.append(e_key + '=' + encode_url_value(value))
            if queries:
                url += '?' + '&'.join(queries)

        # clear_fragment() stores None, so test truthiness rather than != ''.
        if self.fragment:
            url += '#' + self.fragment

        return url

    def send(self):
        """Start the request on a daemon thread and return immediately."""
        url = self._build_url()

        headers = []
        for key in sorted(self.header_formatting):
            f_key = self.header_formatting[key]
            for value in self.header_values[key]:
                headers.append((f_key, value))

        thread = _threading.Thread(target=_send_impl, args=(self, self.method, url, headers, self.content))
        thread.daemon = True
        thread.start()

    def _set_result(self, code, message, content, headers):
        """Store the response; called from the worker thread."""
        with self.mutex:
            self.response_code = code
            self.response_message = message
            self.response_content = content
            self.response_headers_values = {}
            self.response_headers_formatting = {}
            for key in headers.keys():
                ckey = key.lower()
                self.response_headers_values[ckey] = headers[key]
                self.response_headers_formatting[ckey] = key

    def is_complete(self):
        """Return True once a response has been stored."""
        with self.mutex:
            return self.response_code != -1

    def _ensure_request_complete(self):
        """Raise Exception if no response has been stored yet."""
        if not self.is_complete():
            raise Exception("Cannot access response until request is complete.")

    def get_response_code(self):
        """Return the HTTP status code of the response."""
        self._ensure_request_complete()
        return self.response_code

    def get_response_message(self):
        """Return the HTTP status message of the response."""
        self._ensure_request_complete()
        return self.response_message

    def get_response_header_names(self):
        """Return the sorted list of response header names."""
        self._ensure_request_complete()
        return sorted(self.response_headers_formatting.values())

    # The published documentation lists this method as get_response_headers().
    get_response_headers = get_response_header_names

    def get_response_header(self, name):
        """Return the value of response header name (case-insensitive), or None."""
        self._ensure_request_complete()
        return self.response_headers_values.get(name.lower(), None)

    def get_response_content(self, mode='t'):
        """Return the response body.

        mode 't' decodes the body as UTF-8 text; any other mode returns the
        body exactly as it was read.
        """
        self._ensure_request_complete()
        output = self.response_content
        if mode == 't':
            return output.decode('utf-8')
        return output

    def set_header(self, key, value):
        """Set request header key to the single value, replacing earlier ones."""
        canonical_key = key.lower()
        self.header_formatting[canonical_key] = key
        self.header_values[canonical_key] = [value]

    def add_header(self, key, value):
        """Append value to request header key, keeping existing values."""
        existing_headers = self.header_values.get(key.lower(), None)
        if existing_headers is None:
            self.set_header(key, value)
        else:
            existing_headers.append(value)

    def clear_header(self, key):
        """Remove every value of request header key; unknown keys are ignored."""
        canonical_key = key.lower()
        self.header_values.pop(canonical_key, None)
        self.header_formatting.pop(canonical_key, None)

    def set_method(self, method):
        """Set the HTTP method to send."""
        self.method = method

    def set_content(self, content):
        """Set the request body."""
        self.content = content

    def _init_query(self):
        """Convert the constructor's query string into the editable dict, once."""
        if self.query is not None:
            return
        # Only split when there is something to split; the original test was
        # inverted and threw the constructor's query string away.
        items = self.original_query.split('&') if self.original_query != '' else []
        lookup_values = {}
        for item in items:
            raw_key, separator, raw_value = item.partition('=')
            if not separator:
                continue # entries without '=' are skipped
            item_key = decode_url_value(raw_key)
            item_value = decode_url_value(raw_value)
            lookup_values.setdefault(item_key, []).append(item_value)
        self.query = lookup_values

    def set_query(self, key, value):
        """Set query parameter key to the single value."""
        self._init_query()
        self.query[key] = [value]

    def add_query(self, key, value):
        """Append value to query parameter key, keeping existing values."""
        self._init_query()
        self.query.setdefault(key, []).append(value)

    def clear_query(self, key):
        """Remove every value of query parameter key; unknown keys are ignored."""
        self._init_query()
        self.query.pop(key, None)

    def set_port(self, port):
        """Set the port to connect to."""
        self.port = port

    def set_fragment(self, fragment):
        """Set the URL fragment (the part after '#')."""
        self.fragment = fragment

    def clear_fragment(self):
        """Remove the URL fragment."""
        self.fragment = None

    def set_scheme(self, scheme):
        """Set the URL scheme."""
        self.scheme = scheme

    # Original, misspelled name kept so existing callers keep working.
    set_scehem = set_scheme


Documentation

  • HttpAsyncRequest(url) - creates a new Asynchronous HTTP request object, but does not send the request yet so you have a chance to fiddle with it before sending.
  • asyncRequest.send() - sends the HTTP request and returns immediately. (See asyncRequest.is_complete())
  • asyncRequest.set_method(method) - set the method type to GET, POST, PUT, DELETE, HAMSTERS, or whatever.
  • asyncRequest.set_header(name, value) - set an HTTP header
  • asyncRequest.add_header(name, value) - add an HTTP header (yes, HTTP requests can have multiple headers with the same name)
  • asyncRequest.clear_header(name) - clears all values of an HTTP header
  • asyncRequest.set_query(name, value) - sets a URL query value (overrides the constructor value)
  • asyncRequest.add_query(name, value) - adds a URL query value (overrides the constructor value, HTTP allows for multiple values for the same name).
  • asyncRequest.clear_query(name) - clears all query values for a particular name (overrides the constructor value).
  • asyncRequest.set_content(value) - set the HTTP request body.
  • asyncRequest.set_port(port) - set the port (overrides the constructor value).
  • asyncRequest.set_fragment(value) - sets the fragment i.e. the thing after a # (overrides the constructor value).
  • asyncRequest.set_scheme(value) - sets the scheme (overrides the constructor value).
  • asyncRequest.is_complete() - returns True if the request is finished.
  • asyncRequest.get_response_code() - returns the HTTP status code of the response.
  • asyncRequest.get_response_message() - returns the HTTP status message of the response e.g. OK, NOT FOUND, FORBIDDEN, BEEEEES, etc.
  • asyncRequest.get_response_header_names() - returns the sorted list of all the response header names.
  • asyncRequest.get_response_header(name) - returns the value of a response header.
  • asyncRequest.get_response_content(mode='t') - returns the content of the response. mode can be 't' for text or 'b' for binary.

There are some module functions:
  • set_user_agent(value) - Globally sets the User-Agent header for all subsequent requests.
  • decode_url_value(value) - Decodes percent-escaped URL values (all the '%40' silliness) using UTF8. Note that a '+' is left as-is rather than turned into a space. This (and the encode method below) just wrap the relevant Python 2 or 3 urllib libraries so you don't have to worry about compatibility yourself.
  • encode_url_value(value) - Encodes URL values into escaped hex values using UTF8.

PyGame Tips & Tricks

PyWeek 15 starts in one week. So here are a few tips and tricks for a more lovely pygame coding experience.

Image Cache

Use a string->Surface dictionary as an image cache. Wrap this cache with a function called get_image(path). This path will be the key for the cache. If the image isn't found in the cache, load the image from file, but convert the path delimiters. i.e. real_path = path.replace('/', os.sep).replace('\\', os.sep). This way you can just use slashes or backslashes everywhere in your code without cluttering it with a bunch of unsightly os.path.join's.

Caps in file names

If you do your primary development on Windows, adopt a consistent convention for using caps in filenames. Ideally DO NOT USE CAPS AT ALL. Make all files lowercase. All other major OS's have case-sensitive file paths. Be mindful of this.

Use Spriting

There is a HUGE overhead to loading an image from a hard drive. Loading a giant image is only slightly slower than loading a tiny image. If you have billions of tiny images, write a script that combines all the images into one giant image and generates a manifest that describes where each file is on this giant image and its size. In your game, add code to your image cache function that blits this (cached) giant image onto a small empty surface that is the size of the image you want. This way you can load your billion tiny images with only calling pygame.image.load once.

(For this example, assume there is some sort of manifest data structure that is keyed off the filename and contains a position and size field for each file. The implementation of such a datastructure should be trivial.)

# Cache of already-cut images, keyed by the path string passed to get_image().
_images = {}
# Manifest looked up per image for its position and size on the sprite sheet.
# NOTE(review): read_manifest is not defined in this snippet; the text above
# says to assume such a manifest structure exists.
_img_manifest = read_manifest('image_manifest.txt')
# The combined sprite sheet surface; stays None until get_sprite_sheet() loads it.
_sprite_sheet = None
def get_sprite_sheet():
  """Return the shared sprite sheet, loading 'sprite_sheet.png' on first use."""
  global _sprite_sheet
  if _sprite_sheet is not None:
    return _sprite_sheet
  # First call: hit the disk once and remember the surface.
  _sprite_sheet = pygame.image.load('sprite_sheet.png')
  return _sprite_sheet

def get_image(path):
  """Return the image for path, cutting it out of the sprite sheet on first use.

  path may use either '/' or '\\' as separator; it is converted to the
  platform separator for the manifest lookup, while the cache is keyed by the
  path exactly as given.
  """
  img = _images.get(path)
  if img is None:
    real_path = path.replace('/', os.sep).replace('\\', os.sep)
    img_data = _img_manifest.get_image_data(real_path)
    position = img_data.position
    img = pygame.Surface(img_data.size)
    # Go through get_sprite_sheet() so the sheet is loaded on demand; the
    # bare _sprite_sheet global is still None until that function has run.
    # Blitting at the negated position leaves the wanted region at (0, 0).
    img.blit(get_sprite_sheet(), (-position[0], -position[1]))
    _images[path] = img
  return img


One Loop to Rule them All

Only write one game loop. Each logical scene should be an object.

Abstract Raw Input

In this single game loop, abstract the raw input from the framework into logical input that is relevant for your game. Convert the pygame events into MyEvent, a class you create. This class will have event names such as "left", "right", "jump", "shoot" instead of K_LEFT, K_RIGHT, K_UP, K_SPACE. This mapping ought to occur based on some sort of pygame event -> custom event dictionary. This gives you the option of later creating an input configuration menu where the user can edit these values. The rest of your code should be completely unaware of the notion of pygame events. Only logical events.

Joystick

Here be monsters.

Music

Through the complexity of your game and unique ways to hit certain code, it's a common error to get into a situation where the wrong music is playing because the user somehow bypassed a crucial mixer.music.play call. For each logical scene in your game, write a function called ensure_playing(song) that gets called EVERY frame. This function should maintain a state of what song is currently playing and no-op if the input matches that. If not, switch songs.

Abstract the complexities of creating and caching text

Write a function called get_text that takes in the text, font name, color, size, etc. This should return an image that matches these criteria, either from a cache or by generating it if it is not present in the cache. This cache should be keyed off the inputs of the function. For example, if you use a *args as the inputs, you can use this tuple directly as the key. Or construct the tuple manually from rigid arguments. If you have a game with lots of dynamic text, clear this cache periodically.

For Loop Bad

Use while loops instead of for loops when iterating through a simple range of numbers. The range function wastes a ton of memory. The xrange function isn't that great either since it's wasted time to call function, push stack info, etc. A simple while loop is extremely fast by comparison.

Update: So I was totally wrong about this. See the wonderful investigation Omni did in the comments section.
Basically, I projected my experience in other languages onto Python, where iterators use a tad more CPU than a simple integer-incrementing loop, and made a false assumption. Python (both 2.x and 3.x) is smart enough to optimize range out and basically give you the power of a highly optimized loop. Just be sure to use xrange in 2.x.


Don't reblit identical information each frame

If you know something is guaranteed to look the same each frame, then composite multiple blits into one image that's cached and blit that one image. There is quite a bit of overhead to blit. This is especially useful if the blitted region has a number of overlapping blits or if there's complexity to generate the information that needs to be blitted.

Lists Declared Inline Kill Kittens

Do not declare lists or tuples inline in code unless you really need to create a new, separate instance. Declare them once in some sort of initialization function and refer to it that way.

Use Source Control

Even if you're working alone, source control has benefits. It saves the state of your code for when you screw it up and need to revert to a previous version. For me, personally, it helps me focus on one feature at a time without leaving something hanging with a TODO, as I am more mindful when I have to submit complete changelists.

3.x is not your enemy

Embrace 3.x. PyGame tends to run noticeably faster on it. Seriously. And whether or not you plan on using 2.x till the day you die for principled reasons, over time people will be switching to 3.x as their default, and non-3.x compatible code will stop working. Embracing 3.x doesn't have to mean turning your back to 2.x users. For a typical PyGame game, there are only a couple simple things you have to dance around. And it's a fairly simple dance, too:

Python 3.x compatibility Tips

  • Write legacy_map, legacy_filter, etc. that re-implement the behavior of the 2.x versions of map and filter, etc. if those are functions you even use. You could also do the same with range and create a legacy_range, but as stated earlier, you really shouldn't use [x]range.
  • Put parentheses around all print statements.
  • Use // for integer division, and add 0.0 to ints if you intend for float division.
  • Don't throw custom exceptions. If exceptions are occurring as part of your intended codepath in a game, you are probably doing something wrong anyway.
  • Install Python 2.5, 2.6, 2.7, 3.1, and 3.2. Create execution scripts (.sh/.bat) that run your game using these versions. Call them run26.bat, run32.bat, etc. Before you check in code, run your game using a 2.x version and a 3.x version. This is also especially useful for ensuring you don't have any stray print statements you were using for debugging. If you write debug print statements without parentheses and accidentally leave them there, the 3.x version will give a syntax error if you try to run it.

JavaScript Tutorial, Part 5 - Interacting with HTML

What good is JavaScript if you can't interact with the HTML page?

Modify your index.html to look like this:
<html>
    <head>
        <title>JavaScript tutorial</title>
        <script type="text/javascript" src="code.js"></script>
    </head>
    <body>
        <input type="text" id="typey_box" />
        <button onclick="do_things()">Click</button>
        <div id="output">Results will appear here.</div>
    </body>
</html>
onclick is a JavaScript event. I will talk about this more later. Just nod and smile for now.

Modify your code.js file to look like this:
// Called by the button's onclick attribute: shows the length of the typed text.
function do_things()
{
    // Look up the text input and the output div by their id attributes.
    var text_box = document.getElementById('typey_box');
    var results_box = document.getElementById('output');
    // value holds whatever is currently typed into the input.
    var text = text_box.value;
    var message = "string length is " + text.length;
    // Replace the contents of the output div with the message.
    results_box.innerHTML = message;
}
do_things is a user-defined function. I will talk about this more later. Just nod and smile for now.

Your page will look something like this when you launch it.
js_tutorial_10.png

Type something in the box and click the button:
js_tutorial_11.png

Aside from the event and the function definition, two nifty new things happen here.

Getting an element from the page

Here I use the getElementById which exists on the document object. Any HTML element that has its id="..." attribute set, can be accessed by calling this function. The DOM object is stored into the variable and you can do neat things with it. Most importantly...

Reading/Modifying the contents of a DOM object

As you can see there is a handy-dandy property on the DOM object called innerHTML. Reading this property will return a string of the HTML code INSIDE the element. So for example, if you use document.getElementById('foo') to get an element that looks like this:
<div id="foo">Hello</div>

then reading innerHTML on the foo element will return Hello and not <div id="foo">Hello</div>.
Setting it will change the value of Hello on the visible page.

This finally liberates us from being forced to use the window.alert function to get any output from the code. Moreover, it allows you to actually do useful things on the page itself.

// Same handler as before, but the message is additionally wrapped in a red div.
function do_things()
{
    // Look up the text input and the output div by their id attributes.
    var text_box = document.getElementById('typey_box');
    var results_box = document.getElementById('output');
    var text = text_box.value;
    var message = "string length is " + text.length;
    results_box.innerHTML = message;
    // Read the current contents back and write them again inside a div
    // styled with a red font color.
    results_box.innerHTML = '<div style="color:#f00;">' + results_box.innerHTML + '</div>';
}
This will take the text in results_box and wrap it in a div that makes the font red. If you were to read the value of innerHTML on results_box now, be aware that it will include the div itself, not just the text. Suppose you wanted to just modify the text inside the red div. Then you can add an additional ID to the div and requery for that div, then modify the innerHTML of that...
// Same handler again; the red div now has its own id so it can be looked up
// and have its text replaced.
function do_things()
{
    // Look up the text input and the output div by their id attributes.
    var text_box = document.getElementById('typey_box');
    var results_box = document.getElementById('output');
    var text = text_box.value;
    var message = "string length is " + text.length;
    results_box.innerHTML = message;
    // Wrap the current contents in a red div that carries the id inner_div.
    results_box.innerHTML = '<div id="inner_div" style="color:#f00;">' + results_box.innerHTML + '</div>';
    // Query for the div that was just inserted and overwrite only its contents.
    var red_div = document.getElementById('inner_div');
    red_div.innerHTML = "New Text that appears as red.";
}


As a side note, you may notice that I mix and match double and single quotation marks to indicate strings. They are completely equivalent. As a general rule of thumb, I use double-quotes for written text and single-quotes for HTML code. This is because you are likely to encounter apostrophes in written text and those conflict with single-quote-delimited strings. In HTML, double-quotes are common, so I use the single quotes to delimit them.
Go back further...