#!/usr/bin/python
# -*- coding: utf-8 -*-
"""Count the songs marked on a douban.com profile, grouped by album.

The script downloads the paginated song list, walks the rows of the
``<table class="olts">`` element on every page, and prints how many marked
songs belong to each album, least-marked album first.

Written for Python 2; the import fallbacks below also let it load on
Python 3.
"""
import re
import sys

# NOTE: re, sgmllib, urllib, urljoin and urlparse are not used by this script;
# the imports are kept because the original listing had them.
try:  # Python 2 module layout
    import sgmllib
    import urllib
    from HTMLParser import HTMLParser
    from urllib import urlopen
    from urlparse import urljoin, urlparse
except ImportError:  # Python 3 module layout
    import urllib
    from html.parser import HTMLParser
    from urllib.parse import urljoin, urlparse
    from urllib.request import urlopen

# Song-list URL; %d is the offset of the first song on the requested page.
SONGS_URL = "http://www.douban.com/people/apc/songs?start=%d"
# The driver advances the offset by this many songs per request.
PAGE_SIZE = 20


class MyParser(HTMLParser):
    """State machine that pulls (title, album) pairs out of the song table.

    ``state`` values:
        0 -- outside the ``olts`` table
        1 -- inside the table, waiting for a ``<tr>``
        2 -- inside a row, waiting for its first ``<td>``
        3 -- inside the first cell; text seen here is the song title
        4 -- inside the ``<a>`` that follows the title; text is the album

    Attributes:
        num: number of titles seen so far, across all parsed pages.
        albums: dict mapping album text to the list of titles filed under it.
        raw_text: the text of the page most recently passed to ``parse``.
    """

    def __init__(self):
        HTMLParser.__init__(self)
        self.num = 0
        self.albums = {}
        # Initialised here so feed() is usable without going through parse().
        self.state = 0
        self.row = 0
        self.title = None
        self.raw_text = None

    def parse(self, url):
        """Download ``url`` and run its HTML through the state machine."""
        req = urlopen(url)
        try:
            raw = req.read()
        finally:
            req.close()
        if not isinstance(raw, str):
            # Python 3 hands back bytes but feed() needs text.
            # Assumes the page is UTF-8 -- TODO confirm against the
            # Content-Type header if titles come out garbled.
            raw = raw.decode("utf-8", "replace")
        self.raw_text = raw
        # Start from a clean buffer so leftovers from a previous page
        # cannot leak into this one.
        self.reset()
        self.state = 0
        self.feed(raw)
        self.close()

    def handle_starttag(self, tag, attrs):
        """Advance the state machine on an opening tag."""
        if self.state == 0:
            # .get(): a table without a class attribute is simply not ours.
            if tag == "table" and dict(attrs).get("class") == "olts":
                self.state = 1
                self.row = 0
        elif tag == "tr":
            # Every <tr> inside the table starts a new row, whatever the
            # previous row left the machine in; otherwise a short row
            # would swallow the row that follows it.
            self.row += 1
            self.state = 2
        elif self.state == 2:
            if tag == "td":
                self.state = 3
        elif self.state == 3:
            # Only an <a> directly after the title text is the album link.
            self.state = 4 if tag == "a" else 1
        elif self.state == 4:
            self.state = 1

    def handle_endtag(self, tag):
        """Leave the table when its closing tag is seen."""
        if self.state >= 1 and tag == "table":
            self.state = 0

    def handle_data(self, data):
        """Record title text (state 3) and album text (state 4)."""
        if self.state == 3:
            # Row 1 is skipped -- presumably the table's header row.
            if self.row > 1:
                # The last two characters of the cell text are dropped,
                # presumably a separator before the album link on the
                # douban page -- TODO confirm.
                title = data[:-2]
                # No newline: the album is printed on the same line.
                sys.stdout.write("%d title %s " % (self.num, title))
                self.num += 1
                self.title = title
        elif self.state == 4:
            if data.strip():
                print("album %s" % data)
                # An album link with no title captured yet has nothing to
                # be filed under.
                if self.title is not None:
                    self.albums.setdefault(data, []).append(self.title)


def main():
    """Ask for the song count, fetch every page, and print album totals."""
    try:
        read_line = raw_input  # Python 2
    except NameError:
        read_line = input  # Python 3
    numsongs = int(read_line("How many songs do you have?"))

    myparser = MyParser()
    for base in range(0, numsongs, PAGE_SIZE):
        current_url = SONGS_URL % base
        print(current_url)
        myparser.parse(current_url)
    print("done, all together %d songs" % myparser.num)

    albums = myparser.albums
    # Ascending by song count, so the most-marked albums are printed last.
    for name in sorted(albums, key=lambda album: len(albums[album])):
        print("%s %d songs" % (name, len(albums[name])))


if __name__ == "__main__":
    main()

# Blog text that followed the listing: "我输出的结果是这两张专辑我标记的最多:"
# (My output shows that these are the two albums I have marked the most:)
Schindler's List 6 songs
Le Fabuleux destin d'Amélie Poulain 9 songs
辛德勒的名单的OST以及天使爱美丽的OST. 我果然是个电影控.
No comments:
Post a Comment