#!/usr/bin/python
from urlparse import urlparse, urljoin
import urllib, sgmllib
from HTMLParser import HTMLParser
import re, sys
class MyParser(HTMLParser):
    """Collect song titles and album names from ``<table class="olts">`` markup.

    The parser is a small state machine advanced by the start tags it sees:
    table -> row -> first cell -> link inside that cell.  Text found in the
    first cell of a row (from the second row on) is treated as a song title,
    and text found in a link inside that cell as the album of that song.

    Attributes:
        num: number of titles seen so far, accumulated across parse() calls.
        albums: dict mapping album text to the list of titles recorded for it.
        state: current state, one of the ``_OUTSIDE`` .. ``_IN_LINK`` constants.
        row: number of <tr> tags seen in the current table.
        title: most recent title text, or None before any title was seen.
        raw_text: body of the page most recently fetched by parse().
    """

    # State values are kept as the plain integers 0..4.
    _OUTSIDE = 0   # not inside a matching table
    _IN_TABLE = 1  # inside the table, waiting for a <tr>
    _IN_ROW = 2    # saw <tr>, waiting for a <td>
    _IN_CELL = 3   # inside the first <td> of the row: text is a title
    _IN_LINK = 4   # inside an <a> within that cell: text is an album

    def __init__(self):
        HTMLParser.__init__(self)
        self.num = 0
        self.albums = {}
        # Initialise everything the handlers read, so the parser can be fed
        # directly (without parse()) and never hits an unset attribute.
        self.state = self._OUTSIDE
        self.row = 0
        self.title = None

    def parse(self, url):
        """Fetch *url* and run its content through the parser.

        Results accumulate in ``num`` and ``albums`` across calls.
        """
        # Drop any unprocessed input left over from a previous page so it
        # cannot be glued onto the front of this one.
        self.reset()
        self.state = self._OUTSIDE
        req = urllib.urlopen(url)
        try:
            self.raw_text = req.read()
        finally:
            # Release the connection even if reading fails.
            req.close()
        self.feed(self.raw_text)

    def handle_starttag(self, tag, attrs):
        """Advance the state machine on each opening tag."""
        if self.state == self._OUTSIDE:
            # .get() copes with tables that carry no class attribute.
            if tag == "table" and dict(attrs).get("class") == "olts":
                self.state = self._IN_TABLE
                self.row = 0
            return

        if self.state in (self._IN_CELL, self._IN_LINK):
            if self.state == self._IN_CELL and tag == "a":
                self.state = self._IN_LINK
                return
            # Any other tag ends the cell/link.  Fall through to the row
            # check below so that a <tr> arriving here still starts a new
            # row instead of being swallowed (which would drop that row).
            self.state = self._IN_TABLE

        if self.state == self._IN_TABLE:
            if tag == "tr":
                self.row += 1
                self.state = self._IN_ROW
        elif self.state == self._IN_ROW:
            if tag == "td":
                self.state = self._IN_CELL

    def handle_endtag(self, tag):
        """Leave the table when its closing tag is seen."""
        if tag == "table" and self.state != self._OUTSIDE:
            self.state = self._OUTSIDE

    def handle_data(self, data):
        """Record title text (first cell) and album text (link in that cell).

        Progress is echoed to stdout as ``<n> title <title> album <album>``.
        """
        if self.state == self._IN_CELL:
            # Row 1 is skipped -- presumably the table's header row.
            if self.row > 1:
                # The last two characters of the cell text are dropped --
                # presumably a trailing separator in the page markup;
                # TODO confirm against the live page.
                title = data[:-2]
                # Written without a newline (trailing space instead) so the
                # album printed next ends up on the same output line.
                sys.stdout.write("%d title %s " % (self.num, title))
                self.num += 1
                self.title = title
        elif self.state == self._IN_LINK:
            if data.strip():
                # Single-argument print() behaves the same whether it is
                # parsed as a print statement or as a function call.
                print("album %s" % data)
                self.albums.setdefault(data, []).append(self.title)
# Driver: ask how many songs there are, walk the paginated song list with
# one shared parser, then report the albums ordered by number of songs.
numsongs = int(raw_input("How many songs do you have?"))
myparser = MyParser()
# The "start" query parameter is advanced in steps of 20 -- presumably the
# number of songs the site shows per page; TODO confirm.
for base in range(0, numsongs, 20):
    current_url = "http://www.douban.com/people/apc/songs?start=%d" % base
    print(current_url)
    myparser.parse(current_url)
print("done, all together %d songs" % myparser.num)
# Ascending by song count, so the most-tagged albums are printed last.
# sorted() is stable, so albums with equal counts keep the dict's order.
al = sorted(myparser.albums, key=lambda name: len(myparser.albums[name]))
for a in al:
    print("%s %d songs" % (a, len(myparser.albums[a])))
# Author's note (translated): the output I got shows that these are the two
# albums I tagged the most: Schindler's List 6 songs
Le Fabuleux destin d'Amélie Poulain 9 songs
辛德勒的名单的OST以及天使爱美丽的OST. 我果然是个电影控.
No comments:
Post a Comment