diff --git a/data/interfaces/default/config.html b/data/interfaces/default/config.html
index 2f1f5619..bb5ca038 100644
--- a/data/interfaces/default/config.html
+++ b/data/interfaces/default/config.html
@@ -311,7 +311,7 @@ m<%inherit file="base.html"/>
- +
diff --git a/lib/whatapi.py b/lib/whatapi.py index bc6f4394..b6f1b956 100755 --- a/lib/whatapi.py +++ b/lib/whatapi.py @@ -20,15 +20,15 @@ ################################################################################# -__author__="devilcius" -__date__ ="$Oct 23, 2010 11:21:12 PM$" +__author__ = "devilcius" +__date__ = "$Oct 23, 2010 11:21:12 PM$" import hashlib try: from BeautifulSoup import BeautifulSoup except: - raise ImportError,"Please install BeautifulSoup 3.2 module from http://www.crummy.com/software/BeautifulSoup/#Download" + raise ImportError, "Please install BeautifulSoup 3.2 module from http://www.crummy.com/software/BeautifulSoup/#Download" import httplib import os import pickle @@ -149,8 +149,8 @@ class WhatBase(object): print "authenticating..." self.whatcd.headers = Authenticate(self.whatcd).getAuthenticatedHeader() - def _request(self,type, path, data, headers): - return Request(self.whatcd,type,path,data,headers) + def _request(self, type, path, data, headers): + return Request(self.whatcd, type, path, data, headers) def _parser(self): return Parser(self.whatcd) @@ -187,7 +187,7 @@ class Utils(): return text.encode("utf-8") - def _number(self,string): + def _number(self, string): """ Extracts an int from a string. Returns a 0 if None or an empty string was passed """ @@ -222,101 +222,101 @@ class Utils(): class WhatCD(object): - def __init__(self, username, password, site, loginpage, headers): + def __init__(self, username, password, site, loginpage, headers): - #credentials - self.username = username - self.password = password - self.site = site - self.loginpage = loginpage - self.headers = headers - self.authenticateduserinfo = {} + #credentials + self.username = username + self.password = password + self.site = site + self.loginpage = loginpage + self.headers = headers + self.authenticateduserinfo = {} - self.cache_backend = None - self.proxy_enabled = False - self.proxy = None + self.cache_backend = None + self.proxy_enabled = False + self.proxy = None - def isAuthenticated(self): - """ + def isAuthenticated(self): + """ Checks if we are authenticated in what.cd """ - if "id" in self.authenticateduserinfo: - return True - else: - return False + if "id" in self.authenticateduserinfo: + return True + else: + return False - def getCredentials(self): - """ + def getCredentials(self): + """ Returns an authenticated user credentials object """ - return Authenticate(self) + return Authenticate(self) - def getUser(self, username): - """ + def getUser(self, username): + """ Returns an user object """ - return User(username, self) + return User(username, self) - def getTorrent(self, id, page=1): - """ + def getTorrent(self, id, page=1): + """ Returns a torrent object """ - return Torrent(id, page, None, self) + return Torrent(id, page, None, self) - def getTorrentGroup(self, id, page=1): - """ + def getTorrentGroup(self, id, page=1): + """ Returns a torrent object """ - return Torrent(id, page, True, self) + return Torrent(id, page, True, self) - def getArtist(self, name): - """ + def getArtist(self, name): + """ Returns an artist object """ - return Artist(name, self) + return Artist(name, self) - def enableProxy(self, host, port): - """Enable a default web proxy""" - self.proxy = [host, Utils()._number(port)] - self.proxy_enabled = True + def enableProxy(self, host, port): + """Enable a default web proxy""" + self.proxy = [host, Utils()._number(port)] + self.proxy_enabled = True - def disableProxy(self): - """Disable using the web proxy""" - self.proxy_enabled = False + 
def disableProxy(self): + """Disable using the web proxy""" + self.proxy_enabled = False - def isProxyEnabled(self): - """Returns True if a web proxy is enabled.""" - return self.proxy_enabled + def isProxyEnabled(self): + """Returns True if a web proxy is enabled.""" + return self.proxy_enabled - def getProxy(self): - """Returns proxy details.""" - return self.proxy + def getProxy(self): + """Returns proxy details.""" + return self.proxy - def enableCaching(self, file_path = None): - """Enables caching request-wide for all cachable calls. + def enableCaching(self, file_path=None): + """Enables caching request-wide for all cachable calls. * file_path: A file path for the backend storage file. If None set, a temp file would probably be created, according the backend. """ - if not file_path: - file_path = tempfile.mktemp(prefix="whatapi_tmp_") + if not file_path: + file_path = tempfile.mktemp(prefix="whatapi_tmp_") - self.cache_backend = _ShelfCacheBackend(file_path) + self.cache_backend = _ShelfCacheBackend(file_path) - def disableCaching(self): - """Disables all caching features.""" - self.cache_backend = None + def disableCaching(self): + """Disables all caching features.""" + self.cache_backend = None - def isCachingEnabled(self): - """Returns True if caching is enabled.""" + def isCachingEnabled(self): + """Returns True if caching is enabled.""" - return not (self.cache_backend == None) + return not (self.cache_backend == None) - def getCacheBackend(self): + def getCacheBackend(self): - return self.cache_backend + return self.cache_backend -def getWhatcdNetwork(username = "", password = ""): +def getWhatcdNetwork(username="", password=""): """ Returns a preconfigured WhatCD object for what.cd # Parameters: @@ -325,21 +325,21 @@ def getWhatcdNetwork(username = "", password = ""): """ return WhatCD ( - username = username, - password = password, - site = "ssl.what.cd", - loginpage = "/login.php", - headers = { - "Content-type": "application/x-www-form-urlencoded", - 'Accept-Charset': 'utf-8', - 'User-Agent': "whatapi [devilcius]" - }) + username=username, + password=password, + site="ssl.what.cd", + loginpage="/login.php", + headers={ + "Content-type": "application/x-www-form-urlencoded", + 'Accept-Charset': 'utf-8', + 'User-Agent': "whatapi [devilcius]" + }) class _ShelfCacheBackend(object): """Used as a backend for caching cacheable requests.""" - def __init__(self, file_path = None): + def __init__(self, file_path=None): self.shelf = shelve.open(file_path) def getHTML(self, key): @@ -355,7 +355,7 @@ class _ShelfCacheBackend(object): class Request(object): """web service operation.""" - def __init__(self, whatcd,type, path, data, headers): + def __init__(self, whatcd, type, path, data, headers): self.whatcd = whatcd self.utils = Utils() @@ -394,8 +394,8 @@ class Request(object): rb = ResponseBody() if self.whatcd.isProxyEnabled(): - conn = httplib.HTTPSConnection(host = self.whatcd.getProxy()[0], port = self.whatcd.getProxy()[1]) - conn.request(method = self.type, url="https://" + self.whatcd.site + self.path, body = self.data, headers = self.headers) + conn = httplib.HTTPSConnection(host=self.whatcd.getProxy()[0], port=self.whatcd.getProxy()[1]) + conn.request(method=self.type, url="https://" + self.whatcd.site + self.path, body=self.data, headers=self.headers) else: conn.request(self.type, self.path, self.data, self.headers) @@ -406,7 +406,7 @@ class Request(object): conn.close() return rb - def execute(self, cacheable = False): + def execute(self, cacheable=False): """Depending if 
caching is enabled, returns response from the server or, if available, the cached response""" if self.whatcd.isCachingEnabled() and cacheable: response = self.getCachedResponse() @@ -430,23 +430,23 @@ class Authenticate(WhatBase): def setCookie(self): print "creating cookie" f = open('cookie', 'w') - loginform= {'username': self.whatcd.username, 'password': self.whatcd.password \ - , 'keeplogged': '1', 'login': 'Login'} + loginform = {'username': self.whatcd.username, 'password': self.whatcd.password\ + , 'keeplogged': '1', 'login': 'Login'} data = urllib.urlencode(loginform) response = self._request("POST", self.whatcd.loginpage, data, self.whatcd.headers).execute(True) try: - cookie=dict(response.headers)['set-cookie'] - session=re.search("session=[^;]+", cookie).group(0) + cookie = dict(response.headers)['set-cookie'] + session = re.search("session=[^;]+", cookie).group(0) self.whatcd.headers["Cookie"] = session homepage = response.body pickle.dump(self.whatcd.headers, f) except (KeyError, AttributeError): + print "Login failed, most likely bad creds or the site is down, nothing to do" + f.close() os.remove('cookie') self.whatcd.headers = None -# quit() # Commented out...can't have this quitting headphones on us - raise Exception("Login failed, most likely bad creds or the site is down, nothing to do") - finally: - f.close() +# quit() + f.close() def getAuthenticatedHeader(self): @@ -459,18 +459,19 @@ class Authenticate(WhatBase): try: self.whatcd.headers = pickle.load(f) except EOFError: + f.close() os.remove("cookie") print "invalid cookie, removed" self.setCookie() else: self.setCookie() - #set authenticated user info + #set authenticated user info if 'id' not in self.whatcd.authenticateduserinfo: self.whatcd.authenticateduserinfo = self.getAuthenticatedUserInfo(homepage) return self.whatcd.headers - def getAuthenticatedUserInfo(self, homepage = None): + def getAuthenticatedUserInfo(self, homepage=None): """ Returns authenticated user's info """ @@ -611,11 +612,11 @@ class User(WhatBase): """ if self.userid is None: self.userid = self.getUserId() - url = "/torrents.php?type=seeding&userid=%s&page=%d" % (self.userid,page) + url = "/torrents.php?type=seeding&userid=%s&page=%d" % (self.userid, page) torrentspage = BeautifulSoup(self._request("GET", url, "", self.whatcd.headers).execute(True).body) return self._parser().torrentsList(torrentspage) - def getTorrentsSnatched(self,page=1): + def getTorrentsSnatched(self, page=1): """ Returns a list with all user's uploaded music torrents in form of dictionary {page(tuple with current and total),tag, dlurl, id, @@ -624,7 +625,7 @@ class User(WhatBase): """ if self.userid is None: self.userid = self.getUserId() - url = "/torrents.php?type=snatched&userid=%s&page=%d" % (self.userid,page) + url = "/torrents.php?type=snatched&userid=%s&page=%d" % (self.userid, page) torrentspage = BeautifulSoup(self._request("GET", url, "", self.whatcd.headers).execute(True).body) return self._parser().torrentsList(torrentspage) @@ -637,7 +638,7 @@ class User(WhatBase): """ if self.userid is None: self.userid = self.getUserId() - url = "/torrents.php?type=uploaded&userid=%s&page=%d" % (self.userid,page) + url = "/torrents.php?type=uploaded&userid=%s&page=%d" % (self.userid, page) torrentspage = BeautifulSoup(self._request("GET", url, "", self.whatcd.headers).execute(True).body) return self._parser().torrentsList(torrentspage) @@ -651,7 +652,7 @@ class User(WhatBase): if self.userid is None: self.userid = self.getUserId() - url = "/%s&page=%d" % 
(self.specificUserInfo().torrentscomments[1],page) + url = "/%s&page=%d" % (self.specificUserInfo().torrentscomments[1], page) torrentspage = BeautifulSoup(self._request("GET", url, "", self.whatcd.headers).execute(True).body) return self._parser().postsList(torrentspage) @@ -669,14 +670,14 @@ class User(WhatBase): info = SpecificInformation() # Initialize attributes info.joindate, info.lastseen, info.dataup, info.datadown,\ - info.ratio, info.rratio,info.uppercentile,info.downpercentile, \ - info.torrentsuppercentile,info.reqfilledpercentile,info.bountyspentpercentile, \ - info.postsmadepercentile,info.artistsaddedpercentile,info.overallpercentile, \ - info.postsmadecom,info.torrentscommentscom,info.collagesstartedcom,info.collagescontrcon, \ - info.reqfilledcom,info.reqvotedcom,info.uploadedcom,info.uniquecom, info.perfectcom, \ - info.seedingcom, info.leechingcom,info.snatchedcom,info.invitedcom,info.artistsaddedcom \ - = (None,None, None, None,None,None,None,None,None,None,None,None,None, None,\ - None,None,None,None,None,None,None,None,None,None,None,None,None,None) + info.ratio, info.rratio, info.uppercentile, info.downpercentile,\ + info.torrentsuppercentile, info.reqfilledpercentile, info.bountyspentpercentile,\ + info.postsmadepercentile, info.artistsaddedpercentile, info.overallpercentile,\ + info.postsmadecom, info.torrentscommentscom, info.collagesstartedcom, info.collagescontrcon,\ + info.reqfilledcom, info.reqvotedcom, info.uploadedcom, info.uniquecom, info.perfectcom,\ + info.seedingcom, info.leechingcom, info.snatchedcom, info.invitedcom, info.artistsaddedcom\ + = (None, None, None, None, None, None, None, None, None, None, None, None, None, None,\ + None, None, None, None, None, None, None, None, None, None, None, None, None, None) if not self.userinfo and self.getInfo() is None: @@ -686,7 +687,7 @@ class User(WhatBase): info.joindate = self.userinfo['stats']['joined'] info.lastseen = self.userinfo['stats']['lastseen'] info.dataup = self.userinfo['stats']['uploaded'] - info.datadown = self.userinfo['stats']['downloaded'] + info.datadown = self.userinfo['stats']['downloaded'] info.ratio = self.userinfo['stats']['ratio'] info.rratio = self.userinfo['stats']['rratio'] ######## percentile ########### @@ -740,7 +741,7 @@ class Torrent(WhatBase): def getTorrentUrl(self): """ - Returns a dictionnary torrent's real URL + Returns torrent's URL """ if self.isParent: form = {'id': self.id, 'page':self.page} @@ -750,17 +751,25 @@ class Torrent(WhatBase): form = {'torrentid': self.id, 'page':self.page} data = urllib.urlencode(form) headers = self._request("GET", self.torrentpage + data, "", self.whatcd.headers).execute(True).headers + if dict(headers) is None: return None else: - return dict(headers)['location'] + if 'location' not in dict(headers).keys(): + return None + else: + return dict(headers)['location'] def getInfo(self): """ Returns a dictionnary with torrents's info """ - torrentpage = BeautifulSoup(self._request("GET", "/"+self.getTorrentUrl(), "", self.whatcd.headers).execute(True).body) + if self.getTorrentUrl() is None: + print "no torrent retrieved with such id" + return None + + torrentpage = BeautifulSoup(self._request("GET", "/" + self.getTorrentUrl(), "", self.whatcd.headers).execute(True).body) if 'Site log' in torrentpage.find("title").string: print "no torrent retrieved with such id" @@ -929,7 +938,7 @@ class Artist(WhatBase): """ form = {'artistname': self.name} data = urllib.urlencode(form) - headers = self._request("GET", self.artistpage +"?"+ data, "", 
self.whatcd.headers).execute(True).headers + headers = self._request("GET", self.artistpage + "?" + data, "", self.whatcd.headers).execute(True).headers if dict(headers)['location'][0:14] != 'artist.php?id=': return None else: @@ -942,7 +951,7 @@ class Artist(WhatBase): if self.getArtistId(): form = {'id': self.getArtistId()} data = urllib.urlencode(form) - artistpage = BeautifulSoup(self._request("GET", self.artistpage +"?"+ data, "", self.whatcd.headers).execute(True).body) + artistpage = BeautifulSoup(self._request("GET", self.artistpage + "?" + data, "", self.whatcd.headers).execute(True).body) return self._parser().artistInfo(artistpage) else: print "no artist info retrieved" @@ -993,11 +1002,11 @@ class Artist(WhatBase): * info tuple: (The artist's info -str-, image url -str- (None if there isn't)) """ if info[0]: - params = {'action': 'edit','artistid':id} + params = {'action': 'edit', 'artistid':id} data = urllib.urlencode(params) - edit_page = BeautifulSoup(self._request("GET", self.artistpage +"?"+ data, "", self.whatcd.headers).execute(True).body) - what_form = self._parser().whatForm(edit_page,'edit') + edit_page = BeautifulSoup(self._request("GET", self.artistpage + "?" + data, "", self.whatcd.headers).execute(True).body) + what_form = self._parser().whatForm(edit_page, 'edit') if info[1]: image_to_post = info[1] else: @@ -1010,433 +1019,1880 @@ class Artist(WhatBase): 'action':what_form['action']} #post artist's info - self.whatcd.headers['Content-type']="application/x-www-form-urlencoded" + self.whatcd.headers['Content-type'] = "application/x-www-form-urlencoded" response = self._request("POST", self.artistpage, urllib.urlencode(data_to_post), self.whatcd.headers).execute(False) artist_id_returned = dict(response.headers)['location'][14:] - if str(artist_id_returned) == str(what_form['artistid']) : + if str(artist_id_returned) == str(what_form['artistid']): return 1 else: return 0 else: - return 'no artist info provided. Aborting.' - exit() + return 'no artist info provided. Aborting.' 
+ exit() class Parser(object): - def __init__(self,whatcd): - self.utils = Utils() - self.whatcd = whatcd - self.totalpages = 0 + def __init__(self, whatcd): + self.utils = Utils() + self.whatcd = whatcd + self.totalpages = 0 - def authenticatedUserInfo(self, dom): - """ - Parse the index page and returns a dictionnary with basic authenticated user information - """ - userInfo = {} - soup = BeautifulSoup(str(dom)) - for ul in soup.fetch('ul'): - if ul["id"] == "userinfo_username": - #retrieve user logged id - hrefid = ul.findAll('li')[0].find("a")["href"] - regid = re.compile('[0-9]+') - if regid.search(hrefid) is None: - self.debugMessage("not found href to retrieve user id") - else: - userInfo["id"] = regid.search(hrefid).group(0) + def authenticatedUserInfo(self, dom): + """ + Parse the index page and returns a dictionnary with basic authenticated user information + """ + userInfo = {} + soup = BeautifulSoup(str(dom)) + for ul in soup.fetch('ul'): + if ul["id"] == "userinfo_username": + #retrieve user logged id + hrefid = ul.findAll('li')[0].find("a")["href"] + regid = re.compile('[0-9]+') + if regid.search(hrefid) is None: + self.debugMessage("not found href to retrieve user id") + else: + userInfo["id"] = regid.search(hrefid).group(0) - #retrieve user logged id - hrefauth = ul.findAll('li')[2].find("a")["href"] - regauth = re.compile('=[0-9a-fA-F]+') - if regid.search(hrefid) is None: - self.debugMessage("not found href to retrieve user id") - else: - userInfo["authcode"] = regauth.search(hrefauth).group(0)[1:] + #retrieve user logged id + hrefauth = ul.findAll('li')[2].find("a")["href"] + regauth = re.compile('=[0-9a-fA-F]+') + if regid.search(hrefid) is None: + self.debugMessage("not found href to retrieve user id") + else: + userInfo["authcode"] = regauth.search(hrefauth).group(0)[1:] - elif ul["id"] == "userinfo_stats": - if len(ul.findAll('li')) > 0: - userInfo["uploaded"] = ul.findAll('li')[0].find("span").string - userInfo["downloaded"] = ul.findAll('li')[1].find("span").string - userInfo["ratio"] = ul.findAll('li')[2].findAll("span")[1].string - userInfo["required"] = ul.findAll('li')[3].find("span").string - userInfo["authenticate"] = True + elif ul["id"] == "userinfo_stats": + if len(ul.findAll('li')) > 0: + userInfo["uploaded"] = ul.findAll('li')[0].find("span").string + userInfo["downloaded"] = ul.findAll('li')[1].find("span").string + userInfo["ratio"] = ul.findAll('li')[2].findAll("span")[1].string + userInfo["required"] = ul.findAll('li')[3].find("span").string + userInfo["authenticate"] = True - return userInfo + return userInfo - def userInfo(self, dom, user): - """ - Parse an user's page and returns a dictionnary with its information + def userInfo(self, dom, user): + """ + Parse an user's page and returns a dictionnary with its information - # Parameters: - * dom str: user page html - * user str: what.cd username - """ - userInfo = {'stats':{}, 'percentile':{}, 'community':{}} - soup = BeautifulSoup(str(dom)) + # Parameters: + * dom str: user page html + * user str: what.cd username + """ + userInfo = {'stats':{}, 'percentile':{}, 'community':{}} + soup = BeautifulSoup(str(dom)) - for div in soup.fetch('div',{'class':'box'}): + for div in soup.fetch('div', {'class':'box'}): - #if paronoia is not set to 'Off', stop collecting data - if div.findAll('div')[0].string == "Personal": - if div.find('ul').findAll('li')[1].contents[1].string.strip() != "Off": - return None + #if paronoia is not set to 'Off', stop collecting data + if div.findAll('div')[0].string == 
"Personal": + if div.find('ul').findAll('li')[1].contents[1].string.strip() != "Off": + return None - statscontainer = soup.findAll('div', {'class':'box'})[1] - percentilecontainer = soup.findAll('div', {'class':'box'})[2] - communitycontainer = soup.findAll('div', {'class':'box'})[4] + statscontainer = soup.findAll('div', {'class':'box'})[1] + percentilecontainer = soup.findAll('div', {'class':'box'})[2] + communitycontainer = soup.findAll('div', {'class':'box'})[4] - userInfo['stats']['joined'] = statscontainer.findAll('li')[0].find('span')['title'] - userInfo['stats']['lastseen'] = statscontainer.findAll('li')[1].find('span')['title'] - userInfo['stats']['uploaded'] = statscontainer.findAll('li')[2].string[10:] - userInfo['stats']['downloaded'] = statscontainer.findAll('li')[3].string[12:] - userInfo['stats']['ratio'] = statscontainer.findAll('li')[4].find('span').string - userInfo['stats']['rratio'] = statscontainer.findAll('li')[5].string[16:] - userInfo['percentile']['dataup'] = percentilecontainer.findAll('li')[0].string[15:] - userInfo['percentile']['datadown'] = percentilecontainer.findAll('li')[1].string[17:] - userInfo['percentile']['torrentsup'] = percentilecontainer.findAll('li')[2].string[19:] - userInfo['percentile']['reqfilled'] = percentilecontainer.findAll('li')[3].string[17:] - userInfo['percentile']['bountyspent'] = percentilecontainer.findAll('li')[4].string[14:] - userInfo['percentile']['postsmade'] = percentilecontainer.findAll('li')[5].string[12:] - userInfo['percentile']['artistsadded'] = percentilecontainer.findAll('li')[6].string[15:] - userInfo['percentile']['overall'] = percentilecontainer.findAll('li')[7].find('strong').string[14:] + userInfo['stats']['joined'] = statscontainer.findAll('li')[0].find('span')['title'] + userInfo['stats']['lastseen'] = statscontainer.findAll('li')[1].find('span')['title'] + userInfo['stats']['uploaded'] = statscontainer.findAll('li')[2].string[10:] + userInfo['stats']['downloaded'] = statscontainer.findAll('li')[3].string[12:] + userInfo['stats']['ratio'] = statscontainer.findAll('li')[4].find('span').string + userInfo['stats']['rratio'] = statscontainer.findAll('li')[5].string[16:] + userInfo['percentile']['dataup'] = percentilecontainer.findAll('li')[0].string[15:] + userInfo['percentile']['datadown'] = percentilecontainer.findAll('li')[1].string[17:] + userInfo['percentile']['torrentsup'] = percentilecontainer.findAll('li')[2].string[19:] + userInfo['percentile']['reqfilled'] = percentilecontainer.findAll('li')[3].string[17:] + userInfo['percentile']['bountyspent'] = percentilecontainer.findAll('li')[4].string[14:] + userInfo['percentile']['postsmade'] = percentilecontainer.findAll('li')[5].string[12:] + userInfo['percentile']['artistsadded'] = percentilecontainer.findAll('li')[6].string[15:] + userInfo['percentile']['overall'] = percentilecontainer.findAll('li')[7].find('strong').string[14:] - userInfo['community']['forumposts'] = (communitycontainer.findAll('li')[0].contents[0].string[13:len(communitycontainer.findAll('li')[0].contents[0].string)-2],\ - communitycontainer.findAll('li')[0].find('a')['href']) - userInfo['community']['torrentscomments'] = (communitycontainer.findAll('li')[1].contents[0].string[18:len(communitycontainer.findAll('li')[1].contents[0].string)-2],\ - communitycontainer.findAll('li')[1].find('a')['href']) - userInfo['community']['startedcollages'] = (communitycontainer.findAll('li')[2].contents[0].string[18:len(communitycontainer.findAll('li')[2].contents[0].string)-2],\ - 
communitycontainer.findAll('li')[2].find('a')['href']) - userInfo['community']['contributedcollages'] = (communitycontainer.findAll('li')[3].contents[0].string[25:len(communitycontainer.findAll('li')[3].contents[0].string)-2],\ + userInfo['community']['forumposts'] = (communitycontainer.findAll('li')[0].contents[0].string[13:len(communitycontainer.findAll('li')[0].contents[0].string)-2],\ + communitycontainer.findAll('li')[0].find('a')['href']) + userInfo['community']['torrentscomments'] = (communitycontainer.findAll('li')[1].contents[0].string[18:len(communitycontainer.findAll('li')[1].contents[0].string)-2],\ + communitycontainer.findAll('li')[1].find('a')['href']) + userInfo['community']['startedcollages'] = (communitycontainer.findAll('li')[2].contents[0].string[18:len(communitycontainer.findAll('li')[2].contents[0].string)-2],\ + communitycontainer.findAll('li')[2].find('a')['href']) + userInfo['community']['contributedcollages'] = (communitycontainer.findAll('li')[3].contents[0].string[25:len(communitycontainer.findAll('li')[3].contents[0].string)-2],\ communitycontainer.findAll('li')[3].find('a')['href']) - userInfo['community']['reqfilled'] = (communitycontainer.findAll('li')[4].contents[0].string[17:len(communitycontainer.findAll('li')[4].contents[0].string)-2],\ - communitycontainer.findAll('li')[4].find('a')['href']) - userInfo['community']['reqvoted'] = (communitycontainer.findAll('li')[5].contents[0].string[16:len(communitycontainer.findAll('li')[5].contents[0].string)-2],\ - communitycontainer.findAll('li')[5].find('a')['href']) - userInfo['community']['uploaded'] = (communitycontainer.findAll('li')[6].contents[0].string[10:len(communitycontainer.findAll('li')[6].contents[0].string)-2],\ - communitycontainer.findAll('li')[6].find('a')['href']) - userInfo['community']['uniquegroups'] = (communitycontainer.findAll('li')[7].contents[0].string[15:len(communitycontainer.findAll('li')[7].contents[0].string)-2],\ - communitycontainer.findAll('li')[7].find('a')['href']) - userInfo['community']['pefectflacs'] = (communitycontainer.findAll('li')[8].contents[0].string[16:len(communitycontainer.findAll('li')[8].contents[0].string)-2],\ - communitycontainer.findAll('li')[8].find('a')['href']) - userInfo['community']['seeding'] = (communitycontainer.findAll('li')[9].contents[0].string[9:len(communitycontainer.findAll('li')[9].contents[0].string)-2],\ - communitycontainer.findAll('li')[9].find('a')['href']) - userInfo['community']['leeching'] = (communitycontainer.findAll('li')[10].contents[0].string[10:len(communitycontainer.findAll('li')[10].contents[0].string)-2],\ - communitycontainer.findAll('li')[10].find('a')['href']) - #NB: there's a carriage return and white spaces inside the snatched li tag - userInfo['community']['snatched'] = (communitycontainer.findAll('li')[11].contents[0].string[10:len(communitycontainer.findAll('li')[11].contents[0].string)-7],\ - communitycontainer.findAll('li')[11].find('a')['href']) - userInfo['community']['invited'] = (communitycontainer.findAll('li')[12].contents[0].string[9:],\ - None) - userInfo['community']['artists'] = percentilecontainer.findAll('li')[6]['title'] + userInfo['community']['reqfilled'] = (communitycontainer.findAll('li')[4].contents[0].string[17:len(communitycontainer.findAll('li')[4].contents[0].string)-2],\ + communitycontainer.findAll('li')[4].find('a')['href']) + userInfo['community']['reqvoted'] = (communitycontainer.findAll('li')[5].contents[0].string[16:len(communitycontainer.findAll('li')[5].contents[0].string)-2],\ + 
communitycontainer.findAll('li')[5].find('a')['href']) + userInfo['community']['uploaded'] = (communitycontainer.findAll('li')[6].contents[0].string[10:len(communitycontainer.findAll('li')[6].contents[0].string)-2],\ + communitycontainer.findAll('li')[6].find('a')['href']) + userInfo['community']['uniquegroups'] = (communitycontainer.findAll('li')[7].contents[0].string[15:len(communitycontainer.findAll('li')[7].contents[0].string)-2],\ + communitycontainer.findAll('li')[7].find('a')['href']) + userInfo['community']['pefectflacs'] = (communitycontainer.findAll('li')[8].contents[0].string[16:len(communitycontainer.findAll('li')[8].contents[0].string)-2],\ + communitycontainer.findAll('li')[8].find('a')['href']) + userInfo['community']['seeding'] = (communitycontainer.findAll('li')[9].contents[0].string[9:len(communitycontainer.findAll('li')[9].contents[0].string)-2],\ + communitycontainer.findAll('li')[9].find('a')['href']) + userInfo['community']['leeching'] = (communitycontainer.findAll('li')[10].contents[0].string[10:len(communitycontainer.findAll('li')[10].contents[0].string)-2],\ + communitycontainer.findAll('li')[10].find('a')['href']) + #NB: there's a carriage return and white spaces inside the snatched li tag + userInfo['community']['snatched'] = (communitycontainer.findAll('li')[11].contents[0].string[10:len(communitycontainer.findAll('li')[11].contents[0].string)-7],\ + communitycontainer.findAll('li')[11].find('a')['href']) + userInfo['community']['invited'] = (communitycontainer.findAll('li')[12].contents[0].string[9:],\ + None) + userInfo['community']['artists'] = percentilecontainer.findAll('li')[6]['title'] - return userInfo + return userInfo - def torrentInfo(self, dom, id, isparent): - """ - Parse a torrent's page and returns a dictionnary with its information - """ + def torrentInfo(self, dom, id, isparent): + """ + Parse a torrent's page and returns a dictionnary with its information + """ - torrentInfo = {'torrent':{}} - torrentfiles = [] - torrentdescription = "" - isreported = False - isfreeleech = False - soup = BeautifulSoup(str(dom)) - if isparent: - torrentInfo['torrent']['parentid'] = id - else: - groupidurl = soup.findAll('div', {'class':'linkbox'})[0].find('a')['href'] - torrentInfo['torrent']['editioninfo'] = soup.findAll('td', {'class':'edition_info'})[0].find('strong').contents[-1] - regrlsmedia = re.compile('CD|DVD|Vinyl|Soundboard|SACD|Cassette|WEB|Blu-ray') - torrentInfo['torrent']['rlsmedia'] = regrlsmedia.search(torrentInfo['torrent']['editioninfo']).group(0) - torrentInfo['torrent']['parentid'] = groupidurl[groupidurl.rfind("=")+1:] - torrentInfo['torrent']['downloadurl'] = soup.findAll('tr',{'id':'torrent%s'%id})[0].findAll('a',{'title':'Download'})[0]['href'] - ## is freeleech or/and reported? 
## - #both - if len(soup.findAll('tr',{'id':'torrent%s'%id})[0].findAll('a')[-1].contents) == 4: + torrentInfo = {'torrent':{}} + torrentfiles = [] + torrentdescription = "" + isreported = False + isfreeleech = False + soup = BeautifulSoup(str(dom)) + if isparent: + torrentInfo['torrent']['parentid'] = id + else: + groupidurl = soup.findAll('div', {'class':'linkbox'})[0].find('a')['href'] + torrentInfo['torrent']['editioninfo'] = soup.findAll('td', {'class':'edition_info'})[0].find('strong').contents[-1] + regrlsmedia = re.compile('CD|DVD|Vinyl|Soundboard|SACD|Cassette|WEB|Blu-ray') + torrentInfo['torrent']['rlsmedia'] = regrlsmedia.search(torrentInfo['torrent']['editioninfo']).group(0) + torrentInfo['torrent']['parentid'] = groupidurl[groupidurl.rfind("=") + 1:] + torrentInfo['torrent']['downloadurl'] = soup.findAll('tr', {'id':'torrent%s' % id})[0].findAll('a', {'title':'Download'})[0]['href'] + ## is freeleech or/and reported? ## + #both + if len(soup.findAll('tr', {'id':'torrent%s' % id})[0].findAll('a')[-1].contents) == 4: + isreported = True + isfreeleech = True + torrentInfo['torrent']['details'] = soup.findAll('tr', {'id':'torrent%s' % id})[0].findAll('a')[-1].contents[0] + #either + elif len(soup.findAll('tr', {'id':'torrent%s' % id})[0].findAll('a')[-1].contents) == 2: + if soup.findAll('tr', {'id':'torrent%s' % id})[0].findAll('a')[-1].contents[1].string == 'Reported': isreported = True - isfreeleech = True - torrentInfo['torrent']['details'] = soup.findAll('tr',{'id':'torrent%s'%id})[0].findAll('a')[-1].contents[0] - #either - elif len(soup.findAll('tr',{'id':'torrent%s'%id})[0].findAll('a')[-1].contents) == 2: - if soup.findAll('tr',{'id':'torrent%s'%id})[0].findAll('a')[-1].contents[1].string == 'Reported': - isreported = True - elif soup.findAll('tr',{'id':'torrent%s'%id})[0].findAll('a')[-1].contents[1].string == 'Freeleech!': - isreported = True - torrentInfo['torrent']['details'] = soup.findAll('tr',{'id':'torrent%s'%id})[0].findAll('a')[-1].contents[0] - #none - else: - torrentInfo['torrent']['details'] = soup.findAll('tr',{'id':'torrent%s'%id})[0].findAll('a')[-1].contents[0] - torrentInfo['torrent']['isfreeleech'] = isfreeleech - torrentInfo['torrent']['isreported'] = isreported - torrentInfo['torrent']['size'] = soup.findAll('tr',{'id':'torrent%s'%id})[0].findAll('td')[1].string - torrentInfo['torrent']['snatched'] = soup.findAll('tr',{'id':'torrent%s'%id})[0].findAll('td')[2].string - torrentInfo['torrent']['seeders'] = soup.findAll('tr',{'id':'torrent%s'%id})[0].findAll('td')[3].string - torrentInfo['torrent']['leechers'] = soup.findAll('tr',{'id':'torrent%s'%id})[0].findAll('td')[4].string - torrentInfo['torrent']['uploadedby'] = soup.findAll('tr',{'id':'torrent_%s'%id})[0].findAll('a')[0].string - foldername = soup.findAll('div',{'id':'files_%s'%id})[0].findAll('div')[1].string - if(foldername is None): - torrentInfo['torrent']['foldername'] = None - else: - torrentInfo['torrent']['foldername'] = self.utils.decodeHTMLEntities(foldername) - files = soup.findAll('div',{'id':'files_%s'%id})[0].findAll('tr') - for file in files[1:-1]: - torrentfiles.append(self.utils.decodeHTMLEntities(file.contents[0].string)) - torrentInfo['torrent']['filelist'] = torrentfiles - #is there any description? 
- if len(soup.findAll('tr',{'id':'torrent_%s'%id})[0].findAll('blockquote')) > 1: - description = torrentInfo['torrent']['description'] = soup.findAll('tr',{'id':'torrent_%s'%id})[0].findAll('blockquote')[1].contents - info = '' - for content in description: - if content.string: - info = "%s%s" % (info, self.utils._string(content.string)) - torrentdescription = "%s%s" % (torrentdescription, self.utils._string(content.string)) - torrentInfo['torrent']['torrentdescription'] = torrentdescription - regrlstype = re.compile('Album|Soundtrack|EP|Anthology|Compilation|DJ Mix|Single|Live album|Remix|Bootleg|Interview|Mixtape|Unknown') - torrentInfo['torrent']['rlstype'] = regrlstype.search(soup.find('div', {'class':'thin'}).find('h2').contents[1]).group(0) - - torrentInfo['torrent']['comments'] = [] - torrentInfo['torrent']['commentspages'] = 0 - - if len(soup.findAll('table', {'class':'forum_post box vertical_margin'})) > 0: - linkbox = dom.findAll("div", {"class": "linkbox"})[-1] - pages = 1 - postid = '' - userid = '' - post = '' - # if there's more than 1 page of torrents - if linkbox.find("a"): - # by default torrent page show last page of comments - lastpage = linkbox.findAll("a")[-1]['href'] - pages = int(lastpage[18:lastpage.find('&')]) +1 - for comment in soup.findAll('table', {'class':'forum_post box vertical_margin'}): - postid = comment.find("a",{"class":"post_id"}).string[1:] - userid = comment.findAll("a")[1]['href'][12:] - username = comment.findAll("a")[1].string - post = comment.find("div", {"id":"content"+postid}) - post = u''.join([post.string for post in post.findAll(text=True)]) - torrentInfo['torrent']['comments'].append({"postid":postid,"post":post,"userid":userid,"username":username}) - - torrentInfo['torrent']['commentspages'] = pages - - return torrentInfo - - def artistInfo(self, dom): - """ - Parse an artist's page and returns a dictionnary with its information - """ - artistInfo = {} - releases = [] - requests = [] - infoartist = "" - tagsartist = [] - similarartists = [] - soup = BeautifulSoup(str(dom)) - soupfetch = soup.fetch('table',{'class':'torrent_table'}) - if not soupfetch: - soupfetch = soup.fetch('table',{'class':'torrent_table grouped release_table'}) - for releasetype in soupfetch: - releasetypenames = releasetype.findAll('strong') - releasetypename = releasetype.findAll('strong')[0].string - for release in releasetypenames[1:-1]: - #skip release edition info and Freeleech! s - if len(release.parent.contents) > 1 and len(release.contents) > 1 : - releaseyear = release.contents[0][0:4] - releasename = release.contents[1].string - releasehref = release.contents[1]['href'] - releaseid = releasehref[releasehref.rfind('=')+1:] - releases.append({'releasetype':releasetypename,\ - 'year': releaseyear,'name':self.utils.decodeHTMLEntities(releasename),'id':releaseid}) - - artistInfo['releases'] = releases - #is there an artist image? - artistInfo['image'] = None - if soup.find('div', {'class':'box'}).find('img'): - artistInfo['image'] = soup.find('div', {'class':'box'}).find('img')['src'] - #is there any artist info? 
- contents = soup.find('div', {'class':'body'}).contents - if len(contents) > 0: - for content in contents: + elif soup.findAll('tr', {'id':'torrent%s' % id})[0].findAll('a')[-1].contents[1].string == 'Freeleech!': + isreported = True + torrentInfo['torrent']['details'] = soup.findAll('tr', {'id':'torrent%s' % id})[0].findAll('a')[-1].contents[0] + #none + else: + torrentInfo['torrent']['details'] = soup.findAll('tr', {'id':'torrent%s' % id})[0].findAll('a')[-1].contents[0] + torrentInfo['torrent']['isfreeleech'] = isfreeleech + torrentInfo['torrent']['isreported'] = isreported + torrentInfo['torrent']['size'] = soup.findAll('tr', {'id':'torrent%s' % id})[0].findAll('td')[1].string + torrentInfo['torrent']['snatched'] = soup.findAll('tr', {'id':'torrent%s' % id})[0].findAll('td')[2].string + torrentInfo['torrent']['seeders'] = soup.findAll('tr', {'id':'torrent%s' % id})[0].findAll('td')[3].string + torrentInfo['torrent']['leechers'] = soup.findAll('tr', {'id':'torrent%s' % id})[0].findAll('td')[4].string + torrentInfo['torrent']['uploadedby'] = soup.findAll('tr', {'id':'torrent_%s' % id})[0].findAll('a')[0].string + foldername = soup.findAll('div', {'id':'files_%s' % id})[0].findAll('div')[1].string + if(foldername is None): + torrentInfo['torrent']['foldername'] = None + else: + torrentInfo['torrent']['foldername'] = self.utils.decodeHTMLEntities(foldername) + files = soup.findAll('div', {'id':'files_%s' % id})[0].findAll('tr') + for file in files[1:-1]: + torrentfiles.append(self.utils.decodeHTMLEntities(file.contents[0].string)) + torrentInfo['torrent']['filelist'] = torrentfiles + #is there any description? + if len(soup.findAll('tr', {'id':'torrent_%s' % id})[0].findAll('blockquote')) > 1: + description = torrentInfo['torrent']['description'] = soup.findAll('tr', {'id':'torrent_%s' % id})[0].findAll('blockquote')[1].contents + info = '' + for content in description: if content.string: - infoartist = "%s%s" % (infoartist, self.utils._string(content.string)) - artistInfo['info'] = self.utils.decodeHTMLEntities(infoartist) - #is there any artist tags? - if soup.findAll('ul',{'class':'stats nobullet'})[0].findAll('li'): - ul = soup.findAll('ul',{'class':'stats nobullet'})[0].findAll('li') - for li in ul: - if li.contents[0].string: - tagsartist.append(self.utils._string(li.contents[0].string)) - artistInfo['tags'] = tagsartist - #is there any similar artist? - if soup.findAll('ul',{'class':'stats nobullet'})[2].findAll('span',{'title':'2'}): - artists = soup.findAll('ul',{'class':'stats nobullet'})[2].findAll('span',{'title':'2'}) - for artist in artists: - if artist.contents[0].string: - similarartists.append(self.utils._string(artist.contents[0].string)) - artistInfo['similarartists'] = similarartists - #is there any request? 
- if soup.find('table',{'id':'requests'}): - for request in soup.find('table',{'id':'requests'}).findAll('tr',{'class':re.compile('row')}): - requests.append({'requestname':request.findAll('a')[1].string,'id':request.findAll('a')[1]['href'][28:]}) + info = "%s%s" % (info, self.utils._string(content.string)) + torrentdescription = "%s%s" % (torrentdescription, self.utils._string(content.string)) + torrentInfo['torrent']['torrentdescription'] = torrentdescription + regrlstype = re.compile('Album|Soundtrack|EP|Anthology|Compilation|DJ Mix|Single|Live album|Remix|Bootleg|Interview|Mixtape|Unknown') + torrentInfo['torrent']['rlstype'] = regrlstype.search(soup.find('div', {'class':'thin'}).find('h2').contents[1]).group(0) - artistInfo['requests'] = requests + torrentInfo['torrent']['comments'] = [] + torrentInfo['torrent']['commentspages'] = 0 - return artistInfo + if len(soup.findAll('table', {'class':'forum_post box vertical_margin'})) > 0: + linkbox = dom.findAll("div", {"class": "linkbox"})[-1] + pages = 1 + postid = '' + userid = '' + post = '' + # if there's more than 1 page of torrents + if linkbox.find("a"): + # by default torrent page show last page of comments + lastpage = linkbox.findAll("a")[-1]['href'] + pages = int(lastpage[18:lastpage.find('&')]) + 1 + for comment in soup.findAll('table', {'class':'forum_post box vertical_margin'}): + postid = comment.find("a", {"class":"post_id"}).string[1:] + userid = comment.findAll("a")[1]['href'][12:] + username = comment.findAll("a")[1].string + post = comment.find("div", {"id":"content" + postid}) + post = u''.join([post.string for post in post.findAll(text=True)]) + torrentInfo['torrent']['comments'].append({"postid":postid, "post":post, "userid":userid, "username":username}) - def torrentsList(self,dom): - """ - Parse a torrent's list page and returns a dictionnary with its information - """ - torrentslist = [] - torrentssoup = dom.find("table", {"width": "100%"}) - pages = 0 + torrentInfo['torrent']['commentspages'] = pages - #if there's at least 1 torrent in the list - if torrentssoup: - navsoup = dom.find("div", {"class": "linkbox"}) - pages = 1 - regyear = re.compile('\[\d{4}\]') + return torrentInfo - #is there a page navigation bar? - if navsoup.contents: - #if there's more than 1 page of torrents - if navsoup.contents[-1].has_key('href'): - lastpage = navsoup.contents[-1]['href'] - pages = lastpage[18:lastpage.find('&')] - self.totalpages = pages - else: #we are at the last page, no href - pages = self.totalpages+1 - #fetch all tr except first one (column head) - for torrent in torrentssoup.fetch('tr')[1:]: - #exclude non music torrents - if torrent.find('td').find('div')['class'][0:10] == 'cats_music': + def artistInfo(self, dom): + """ + Parse an artist's page and returns a dictionnary with its information + """ + artistInfo = {} + releases = [] + requests = [] + infoartist = "" + tagsartist = [] + similarartists = [] + soup = BeautifulSoup(str(dom)) + for releasetype in soup.fetch('table', {'class':'torrent_table'}): + releasetypenames = releasetype.findAll('strong') + releasetypename = releasetype.findAll('strong')[0].string + for release in releasetypenames[1:-1]: + #skip release edition info and Freeleech! 
s + if len(release.parent.contents) > 1 and len(release.contents) > 1: + releaseyear = release.contents[0][0:4] + releasename = release.contents[1].string + releasehref = release.contents[1]['href'] + releaseid = releasehref[releasehref.rfind('=') + 1:] + releases.append({'releasetype':releasetypename,\ + 'year': releaseyear, 'name':self.utils.decodeHTMLEntities(releasename), 'id':releaseid}) - torrenttag = torrent.find('td').contents[1]['title'] - torrentdl = torrent.findAll('td')[1].find('span').findAll('a')[0]['href'] - torrentrm = torrent.findAll('td')[1].find('span').findAll('a')[1]['href'] - torrentid = torrentrm[torrentrm.rfind('=')+1:] - torrenttd = torrent.findAll('td')[1] + artistInfo['releases'] = releases + #is there an artist image? + artistInfo['image'] = None + if soup.find('div', {'class':'box'}).find('img'): + artistInfo['image'] = soup.find('div', {'class':'box'}).find('img')['src'] + #is there any artist info? + contents = soup.find('div', {'class':'body'}).contents + if len(contents) > 0: + for content in contents: + if content.string: + infoartist = "%s%s" % (infoartist, self.utils._string(content.string)) + artistInfo['info'] = self.utils.decodeHTMLEntities(infoartist) + #is there any artist tags? + if soup.findAll('ul', {'class':'stats nobullet'})[0].findAll('li'): + ul = soup.findAll('ul', {'class':'stats nobullet'})[0].findAll('li') + for li in ul: + if li.contents[0].string: + tagsartist.append(self.utils._string(li.contents[0].string)) + artistInfo['tags'] = tagsartist + #is there any similar artist? + if soup.findAll('ul', {'class':'stats nobullet'})[2].findAll('span', {'title':'2'}): + artists = soup.findAll('ul', {'class':'stats nobullet'})[2].findAll('span', {'title':'2'}) + for artist in artists: + if artist.contents[0].string: + similarartists.append(self.utils._string(artist.contents[0].string)) + artistInfo['similarartists'] = similarartists + #is there any request? 
+ if soup.find('table', {'id':'requests'}): + for request in soup.find('table', {'id':'requests'}).findAll('tr', {'class':re.compile('row')}): + requests.append({'requestname':request.findAll('a')[1].string, 'id':request.findAll('a')[1]['href'][28:]}) - # remove dataless elements - torrenttags = torrenttd.div - rightlinks = torrenttd.span - torrenttags.extract() - rightlinks.extract() + artistInfo['requests'] = requests - # remove line breaks - torrenttd = "".join([line.strip() for line in str(torrenttd).split("\n")]) - torrenttd = BeautifulSoup(torrenttd) - isScene = False - info = "" + return artistInfo - if len(torrenttd.findAll('a')) == 2: - #one artist - torrentartist = (self.utils.decodeHTMLEntities(torrenttd.find("a").string),) - artistid = (torrenttd.find("a")['href'][14:],) - torrentalbum = torrenttd.findAll("a")[1].string - info = torrenttd.findAll("a")[1].nextSibling.string.strip() + def torrentsList(self, dom): + """ + Parse a torrent's list page and returns a dictionnary with its information + """ + torrentslist = [] + torrentssoup = dom.find("table", {"width": "100%"}) + pages = 0 + #if there's at least 1 torrent in the list + if torrentssoup: + navsoup = dom.find("div", {"class": "linkbox"}) + pages = 1 + regyear = re.compile('\[\d{4}\]') - elif len(torrenttd.findAll('a')) == 1: - #various artists - torrentartist = ('Various Artists',) - artistid = () - torrentalbum = torrenttd.find("a").string - info = torrenttd.find("a").nextSibling.string.strip() - - elif len(torrenttd.findAll('a')) == 3: - #two artists - torrentartist = (self.utils.decodeHTMLEntities(torrenttd.findAll("a")[0].string), \ - self.utils.decodeHTMLEntities(torrenttd.findAll("a")[1].string)) - artistid = (torrenttd.findAll("a")[0]['href'][14:],\ - torrenttd.findAll("a")[1]['href'][14:]) - torrentalbum = torrenttd.findAll("a")[2].string - info = torrenttd.findAll("a")[2].nextSibling.string.strip() - - elif torrenttd.find(text=re.compile('performed by')): - #performed by - torrentartist = (self.utils.decodeHTMLEntities(torrenttd.findAll("a")[-2].string),) - artistid = (torrenttd.findAll("a")[-2]['href'][14:],) - torrentalbum = torrenttd.findAll("a")[-1].string - info = torrenttd.findAll("a")[-1].nextSibling.string.strip() - - if 'Scene' in info: - isScene = True - - torrentyear = regyear.search(info).group(0)[1:5] - torrentslist.append({'tag':torrenttag,\ - 'dlurl':torrentdl,\ - 'id':torrentid, \ - 'artist':torrentartist,\ - 'artistid':artistid,\ - 'album':self.utils.decodeHTMLEntities(torrentalbum), - 'year':torrentyear, - 'pages':pages, - 'scene':isScene}) - - return torrentslist - - def postsList(self,dom): - """ - Parse a post list page and returns a dictionnary with each post information: - {torrentid, commentid, postid} - """ - postslist = [] - postssoup = dom.find("div", {"class": "thin"}) - pages = 0 - - #if there's at least 1 post in the list - if postssoup: - navsoup = dom.find("div", {"class": "linkbox"}) - + #is there a page navigation bar? 
+ if navsoup.contents: #if there's more than 1 page of torrents - if navsoup.find("a"): - lastpage = navsoup.findAll("a")[1]['href'] + if navsoup.contents[-1].has_key('href'): + lastpage = navsoup.contents[-1]['href'] pages = lastpage[18:lastpage.find('&')] self.totalpages = pages - else: #we are at the last page, no link - pages = 1 + else: #we are at the last page, no href + pages = self.totalpages + 1 + #fetch all tr except first one (column head) + for torrent in torrentssoup.fetch('tr')[1:]: + #exclude non music torrents + if torrent.find('td').find('div')['class'][0:10] == 'cats_music': - for post in postssoup.fetch('table', {'class':'forum_post box vertical_margin'}): - commentbody = post.find("td", {"class":"body"}) - postid = post.find("span").findAll("a")[0].string[1:] - torrentid = post.find("span").findAll("a")[-1]['href'][post.find("span").findAll("a")[-1]['href'].rfind('=')+1:] - comment = u''.join([commentbody.string for commentbody in commentbody.findAll(text=True)]) - postdate = post.find("span", {"class":"time"})['title'] - postslist.append({'postid':postid,\ - 'torrentid':torrentid,\ - 'comment':comment,\ - 'postdate':postdate,\ - 'pages':pages}) + torrenttag = torrent.find('td').contents[1]['title'] + torrentdl = torrent.findAll('td')[1].find('span').findAll('a')[0]['href'] + torrentrm = torrent.findAll('td')[1].find('span').findAll('a')[1]['href'] + torrentid = torrentrm[torrentrm.rfind('=') + 1:] + torrenttd = torrent.findAll('td')[1] + + # remove dataless elements + torrenttags = torrenttd.div + rightlinks = torrenttd.span + torrenttags.extract() + rightlinks.extract() + + # remove line breaks + torrenttd = "".join([line.strip() for line in str(torrenttd).split("\n")]) + torrenttd = BeautifulSoup(torrenttd) + isScene = False + info = "" + + if len(torrenttd.findAll('a')) == 2: + #one artist + torrentartist = (self.utils.decodeHTMLEntities(torrenttd.find("a").string),) + artistid = (torrenttd.find("a")['href'][14:],) + torrentalbum = torrenttd.findAll("a")[1].string + info = torrenttd.findAll("a")[1].nextSibling.string.strip() - return postslist + elif len(torrenttd.findAll('a')) == 1: + #various artists + torrentartist = ('Various Artists',) + artistid = () + torrentalbum = torrenttd.find("a").string + info = torrenttd.find("a").nextSibling.string.strip() + + elif len(torrenttd.findAll('a')) == 3: + #two artists + torrentartist = (self.utils.decodeHTMLEntities(torrenttd.findAll("a")[0].string),\ + self.utils.decodeHTMLEntities(torrenttd.findAll("a")[1].string)) + artistid = (torrenttd.findAll("a")[0]['href'][14:],\ + torrenttd.findAll("a")[1]['href'][14:]) + torrentalbum = torrenttd.findAll("a")[2].string + info = torrenttd.findAll("a")[2].nextSibling.string.strip() + + elif torrenttd.find(text=re.compile('performed by')): + #performed by + torrentartist = (self.utils.decodeHTMLEntities(torrenttd.findAll("a")[-2].string),) + artistid = (torrenttd.findAll("a")[-2]['href'][14:],) + torrentalbum = torrenttd.findAll("a")[-1].string + info = torrenttd.findAll("a")[-1].nextSibling.string.strip() + + if 'Scene' in info: + isScene = True + + torrentyear = regyear.search(info).group(0)[1:5] + torrentslist.append({'tag':torrenttag,\ + 'dlurl':torrentdl,\ + 'id':torrentid,\ + 'artist':torrentartist,\ + 'artistid':artistid,\ + 'album':self.utils.decodeHTMLEntities(torrentalbum), + 'year':torrentyear, + 'pages':pages, + 'scene':isScene}) + + return torrentslist + + def postsList(self, dom): + """ + Parse a post list page and returns a dictionnary with each post information: + 
{torrentid, commentid, postid} """ + postslist = [] + postssoup = dom.find("div", {"class": "thin"}) + pages = 0 + + #if there's at least 1 post in the list + if postssoup: + navsoup = dom.find("div", {"class": "linkbox"}) + + #if there's more than 1 page of torrents + if navsoup.find("a"): + lastpage = navsoup.findAll("a")[1]['href'] + pages = lastpage[18:lastpage.find('&')] + self.totalpages = pages + else: #we are at the last page, no link + pages = 1 + + for post in postssoup.fetch('table', {'class':'forum_post box vertical_margin'}): + commentbody = post.find("td", {"class":"body"}) + postid = post.find("span").findAll("a")[0].string[1:] + torrentid = post.find("span").findAll("a")[-1]['href'][post.find("span").findAll("a")[-1]['href'].rfind('=') + 1:] + comment = u''.join([commentbody.string for commentbody in commentbody.findAll(text=True)]) + postdate = post.find("span", {"class":"time"})['title'] + postslist.append({'postid':postid,\ + 'torrentid':torrentid,\ + 'comment':comment,\ + 'postdate':postdate,\ + 'pages':pages}) - return postslist + + return postslist + + def whatForm(self, dom, action): + """ + Parse a what.cd edit page and returns a dict with all form inputs/textareas names and values + # Parameters: + * dom str: the edit page dom. + + action str: the action value from the requested form + """ + inputs = {} + + form = dom.find('input', {'name':'action', 'value':action}).parent + elements = form.fetch(('input', 'textarea')) + #get all form elements except for submit input + for element in elements[0:-1]: + name = element.get('name', None) + if element.name == 'textarea': + inputs[name] = element.string + else: + inputs[name] = element.get('value', None) + return inputs if __name__ == "__main__": - print "Module to manage what.cd as a web service" + print "Module to manage what.cd as a web service"
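
Usage note: a minimal sketch of the API exercised by the hunks above, in the module's own Python 2 style. The import path, credentials, and torrent id are illustrative assumptions, not values taken from this patch:

    from lib.whatapi import getWhatcdNetwork  # import path assumed from this repo's layout

    # getWhatcdNetwork() preconfigures site, login page and headers (see hunk above);
    # User/Torrent/Artist objects authenticate lazily through Authenticate on first use.
    whatcd = getWhatcdNetwork(username="someuser", password="somepass")
    whatcd.enableCaching()  # shelve-backed cache in a temp file, used by cacheable requests

    user = whatcd.getUser("someuser")
    for torrent in user.getTorrentsSnatched(page=1):
        # torrentsList() returns dicts; 'artist' is a tuple of one or more names
        print "%s - %s [%s]" % (", ".join(torrent['artist']), torrent['album'], torrent['year'])

    info = whatcd.getTorrent(12345).getInfo()  # now returns None when the id matches no torrent
    if info:
        print info['torrent']['downloadurl']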
+# +################################################################################# + + +__author__ = "devilcius" +__date__ = "$Oct 23, 2010 11:21:12 PM$" + + +import hashlib +try: + from BeautifulSoup import BeautifulSoup +except: + raise ImportError, "Please install BeautifulSoup 3.2 module from http://www.crummy.com/software/BeautifulSoup/#Download" +import httplib +import os +import pickle +import re +import urllib +import shelve +import tempfile +from htmlentitydefs import name2codepoint as n2cp + + +""" +A list of the implemented webservices (from what.cd ) +===================================== + +# User + + * user.getUserId + * user.getInfo + + * user.getTorrentsSeeding + * user.getTorrentsSnatched + * user.getTorrentsUploaded + * user.getTorrentsCommented + + * user.specificUserInfo + Atributes: + ######## stats ########### + -joindate + -lastseen + -dataup + -datadown + -ratio + -rratio + ######## percentile ########### + -uppercentile + -downpercentile + -torrentsuppercentile + -reqfilledpercentile + -bountyspentpercentile + -postsmadepercentile + -artistsaddedpercentile + -overallpercentile + ######## community ########### + -postsmade + -torrentscomments + -collagesstarted + -collagescontr + -reqfilled + -reqvoted + -uploaded + -unique + -perfect + -seeding + -leeching + -snatched + -invited + -artistsadded + + +# Artist + + * artist.getArtistReleases + * artist.getArtistImage + * artist.getArtistInfo + * artist.getArtistTags + * artist.getArtistSimilar + * artist.getArtistRequests + + + artist.setArtistInfo + + +# Torrent + + * torrent.getTorrentParentId + * torrent.getTorrentDownloadURL + * torrent.getTorrentDetails + * torrent.getTorrentSize + * torrent.getTorrentSnatched + * torrent.getTorrentSeeders + * torrent.getTorrentLeechers + * torrent.getTorrentUploadedBy + * torrent.getTorrentFolderName + * torrent.getTorrentFileList + * torrent.getTorrentDescription + * torrent.getTorrentComments + * torrent.isTorrentFreeLeech + * torrent.isTorrentReported + + +# Authenticate + + * authenticate.getAuthenticatedUserId + * authenticate.getAuthenticatedUserAuthCode + * authenticate.getAuthenticatedUserDownload + * authenticate.getAuthenticatedUserUpload() + * authenticate.getAuthenticatedUserRatio + * authenticate.getAuthenticatedUserRequiredRatio + +""" + +class ResponseBody: + """A Response Body Object""" + pass + +class SpecificInformation: + """A Specific Information Object""" + pass + + +class WhatBase(object): + """An abstract webservices object.""" + whatcd = None + + def __init__(self, whatcd): + self.whatcd = whatcd + #if we are not autenticated in what.cd, do it now + if not self.whatcd.isAuthenticated(): + print "authenticating..." + self.whatcd.headers = Authenticate(self.whatcd).getAuthenticatedHeader() + + def _request(self, type, path, data, headers): + return Request(self.whatcd, type, path, data, headers) + + def _parser(self): + return Parser(self.whatcd) + + def utils(self): + return Utils() + + +class Utils(): + + def md5(self, text): + """Returns the md5 hash of a string.""" + + h = hashlib.md5() + h.update(self._string(text)) + + return h.hexdigest() + + def _unicode(self, text): + if type(text) == unicode: + return text + + if type(text) == int: + return unicode(text) + + return unicode(text, "utf-8") + + def _string(self, text): + if type(text) == str: + return text + + if type(text) == int: + return str(text) + + return text.encode("utf-8") + + def _number(self, string): + """ + Extracts an int from a string. 
Returns a 0 if None or an empty string was passed + """ + + if not string: + return 0 + elif string == "": + return 0 + else: + try: + return int(string) + except ValueError: + return float(string) + + def substituteEntity(self, match): + ent = match.group(2) + if match.group(1) == "#": + return unichr(int(ent)) + else: + cp = n2cp.get(ent) + + if cp: + return unichr(cp) + else: + return match.group() + + def decodeHTMLEntities(self, string): + entity_re = re.compile("&(#?)(\d{1,5}|\w{1,8});") + return entity_re.subn(self.substituteEntity, string)[0] + + + +class WhatCD(object): + + def __init__(self, username, password, site, loginpage, headers): + + #credentials + self.username = username + self.password = password + self.site = site + self.loginpage = loginpage + self.headers = headers + self.authenticateduserinfo = {} + + self.cache_backend = None + self.proxy_enabled = False + self.proxy = None + + def isAuthenticated(self): + """ + Checks if we are authenticated in what.cd + """ + if "id" in self.authenticateduserinfo: + return True + else: + return False + + def getCredentials(self): + """ + Returns an authenticated user credentials object + """ + return Authenticate(self) + + + def getUser(self, username): + """ + Returns an user object + """ + return User(username, self) + + def getTorrent(self, id, page=1): + """ + Returns a torrent object + """ + return Torrent(id, page, None, self) + + def getTorrentGroup(self, id, page=1): + """ + Returns a torrent object + """ + return Torrent(id, page, True, self) + + def getArtist(self, name): + """ + Returns an artist object + """ + return Artist(name, self) + + def enableProxy(self, host, port): + """Enable a default web proxy""" + self.proxy = [host, Utils()._number(port)] + self.proxy_enabled = True + + def disableProxy(self): + """Disable using the web proxy""" + self.proxy_enabled = False + + def isProxyEnabled(self): + """Returns True if a web proxy is enabled.""" + return self.proxy_enabled + + def getProxy(self): + """Returns proxy details.""" + return self.proxy + + def enableCaching(self, file_path=None): + """Enables caching request-wide for all cachable calls. + * file_path: A file path for the backend storage file. If + None set, a temp file would probably be created, according the backend. 
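+
+        A minimal usage sketch (the cache file path is hypothetical):
+
+            whatcd = getWhatcdNetwork("user", "pass")
+            whatcd.enableCaching("/tmp/whatapi.cache")   # or enableCaching() for a temp file
+            artist = whatcd.getArtist("Some Artist")     # cacheable requests now go through the shelf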
+        """
+        if not file_path:
+            file_path = tempfile.mktemp(prefix="whatapi_tmp_")
+
+        self.cache_backend = _ShelfCacheBackend(file_path)
+
+    def disableCaching(self):
+        """Disables all caching features."""
+        self.cache_backend = None
+
+    def isCachingEnabled(self):
+        """Returns True if caching is enabled."""
+
+        return self.cache_backend is not None
+
+    def getCacheBackend(self):
+
+        return self.cache_backend
+
+def getWhatcdNetwork(username="", password=""):
+    """
+    Returns a preconfigured WhatCD object for what.cd
+    # Parameters:
+        * username str: a username of a valid what.cd user
+        * password str: user's password
+    """
+
+    return WhatCD(
+        username=username,
+        password=password,
+        site="ssl.what.cd",
+        loginpage="/login.php",
+        headers={
+            "Content-type": "application/x-www-form-urlencoded",
+            'Accept-Charset': 'utf-8',
+            'User-Agent': "whatapi [devilcius]"
+        })
+
+
+
+class _ShelfCacheBackend(object):
+    """Used as a backend for caching cacheable requests."""
+    def __init__(self, file_path=None):
+        self.shelf = shelve.open(file_path)
+
+    def getHTML(self, key):
+        return self.shelf[key]
+
+    def setHTML(self, key, xml_string):
+        self.shelf[key] = xml_string
+
+    def hasKey(self, key):
+        return key in self.shelf.keys()
+
+
+class Request(object):
+    """A web service operation."""
+
+    def __init__(self, whatcd, type, path, data, headers):
+
+        self.whatcd = whatcd
+        self.utils = Utils()
+        self.type = type
+        self.path = path
+        self.data = data
+        self.headers = headers
+        #enable caching?
+        if whatcd.isCachingEnabled():
+            self.cache = whatcd.getCacheBackend()
+
+    def getCacheKey(self):
+        """The cache key is an md5 hash of the request params."""
+
+        key = self.type + self.path + self.data
+        return Utils().md5(key)
+
+    def getCachedResponse(self):
+        """Returns the cached response, downloading and storing it first if necessary."""
+
+        if not self.isCached():
+            response = self.downloadResponse()
+            self.cache.setHTML(self.getCacheKey(), response)
+        return self.cache.getHTML(self.getCacheKey())
+
+    def isCached(self):
+        """Returns True if the request is already in cache."""
+
+        return self.cache.hasKey(self.getCacheKey())
+
+    def downloadResponse(self):
+        """Returns a ResponseBody object from the server."""
+
+        #print "downloading from %s" % (self.path)
+        conn = httplib.HTTPSConnection(self.whatcd.site)
+        rb = ResponseBody()
+
+        if self.whatcd.isProxyEnabled():
+            conn = httplib.HTTPSConnection(host=self.whatcd.getProxy()[0], port=self.whatcd.getProxy()[1])
+            conn.request(method=self.type, url="https://" + self.whatcd.site + self.path, body=self.data, headers=self.headers)
+        else:
+            conn.request(self.type, self.path, self.data, self.headers)
+
+        response = conn.getresponse()
+        rb.headers = response.getheaders()
+        # Rip all inline JavaScript out of the response in case it hasn't been properly escaped
+        rb.body = re.sub('(?is)<script.*?</script>', '', response.read())
+        conn.close()
+        return rb
+
+    def execute(self, cacheable=False):
+        """Returns the cached response when caching is enabled and the call is cacheable; otherwise fetches from the server"""
+        if self.whatcd.isCachingEnabled() and cacheable:
+            response = self.getCachedResponse()
+        else:
+            response = self.downloadResponse()
+
+        return response
+
+class Authenticate(WhatBase):
+
+    def __init__(self, whatcd):
+        """Create an authenticated user object.
+        # Parameters:
+            * whatcd object: WhatCD object.
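+
+        Note: instantiating this class triggers a login when no session
+        exists yet; the session headers are pickled to a "cookie" file in
+        the working directory and reused on later runs.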
+ """ + self.whatcd = whatcd + self.parser = Parser(whatcd) + if not self.whatcd.isAuthenticated(): + self.getAuthenticatedHeader() + + def setCookie(self): + print "creating cookie" + f = open('cookie', 'w') + loginform = {'username': self.whatcd.username, 'password': self.whatcd.password\ + , 'keeplogged': '1', 'login': 'Login'} + data = urllib.urlencode(loginform) + response = self._request("POST", self.whatcd.loginpage, data, self.whatcd.headers).execute(True) + try: + cookie = dict(response.headers)['set-cookie'] + session = re.search("session=[^;]+", cookie).group(0) + self.whatcd.headers["Cookie"] = session + homepage = response.body + pickle.dump(self.whatcd.headers, f) + except (KeyError, AttributeError): + print "Login failed, most likely bad creds or the site is down, nothing to do" + f.close() + os.remove('cookie') + self.whatcd.headers = None + quit() + f.close() + + + def getAuthenticatedHeader(self): + """ + Log user in what.cd and returns the authenticated header + """ + homepage = None + if os.path.exists("cookie"): + f = open("cookie", "r") + try: + self.whatcd.headers = pickle.load(f) + except EOFError: + f.close() + os.remove("cookie") + print "invalid cookie, removed" + self.setCookie() + else: + self.setCookie() + #set authenticated user info + if 'id' not in self.whatcd.authenticateduserinfo: + self.whatcd.authenticateduserinfo = self.getAuthenticatedUserInfo(homepage) + + return self.whatcd.headers + + def getAuthenticatedUserInfo(self, homepage=None): + """ + Returns authenticated user's info + """ + if not homepage: + homepage = BeautifulSoup(self._request("GET", "/index.php", "", self.whatcd.headers).execute(True).body) + authuserinfo = self._parser().authenticatedUserInfo(homepage.find("div", {"id": "userinfo"})) + return authuserinfo + + def getAuthenticatedUserId(self): + """ + Returns authenticated user's id + """ + return self.whatcd.authenticateduserinfo["id"] + + def getAuthenticatedUserAuthCode(self): + """ + Returns authenticated user's authcode + """ + return self.whatcd.authenticateduserinfo["authcode"] + + + def getAuthenticatedUserUpload(self): + """ + Returns authenticated user's total uploaded data + """ + return self.whatcd.authenticateduserinfo["uploaded"] + + + def getAuthenticatedUserDownload(self): + """ + Returns authenticated user's total downloaded data + """ + return self.whatcd.authenticateduserinfo["downloaded"] + + + def getAuthenticatedUserRatio(self): + """ + Returns authenticated user's ratio + """ + return self.whatcd.authenticateduserinfo["ratio"] + + def getAuthenticatedUserRequiredRatio(self): + """ + Returns authenticated user's required ratio + """ + return self.whatcd.authenticateduserinfo["required"] + + +class User(WhatBase): + """A What.CD user""" + + def __init__(self, username, whatcd): + """Create an user object. + # Parameters: + * username str: The user's name. + - whatcd object: the what.cd network object + """ + WhatBase.__init__(self, whatcd) + self.name = username + self.whatcd = whatcd + self.userpage = "/user.php?" 
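+        #userid and userinfo are resolved lazily by getUserId()/getInfo()
+        #and cached here for later calls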
+        self.userid = None
+        self.userinfo = None
+
+    def getUserName(self):
+        """
+        Returns user's name
+        """
+        return self.name
+
+    def getUserId(self):
+        """
+        Returns user's id, None if user doesn't exist
+        """
+        if self.userid:
+            return self.userid
+        else:
+            idform = {'action': "search", 'search': self.name}
+            data = urllib.urlencode(idform)
+            headers = self._request("GET", self.userpage + data, "", self.whatcd.headers).execute(True).headers
+            if 'location' not in dict(headers):
+                return None
+            else:
+                self.userid = dict(headers)['location'][12:]
+                return self.userid
+
+    def getInfo(self):
+        """
+        Returns a dictionary of {percentile: {dataup str,
+                                              datadown str,
+                                              overall str,
+                                              postsmade str,
+                                              bountyspent str,
+                                              reqfilled str,
+                                              artistsadded str,
+                                              torrentsup str},
+                                 stats: {uploaded str,
+                                         ratio str,
+                                         joined str,
+                                         downloaded str,
+                                         lastseen str,
+                                         rratio str},
+                                 community: {uploaded tuple(total str, url str),
+                                             forumposts tuple(total str, url str),
+                                             invited tuple(total, None),
+                                             perfectflacs tuple(total str, url str),
+                                             contributedcollages tuple(total str, url str),
+                                             reqvoted tuple(total str, url str),
+                                             uniquegroups tuple(total str, url str),
+                                             torrentscomments tuple(total str, url str),
+                                             snatched tuple(total str, url str),
+                                             artists str,
+                                             reqfilled tuple(total str, url str),
+                                             startedcollages tuple(total str, url str),
+                                             leeching tuple(total str, url str),
+                                             seeding tuple(total str, url str)}
+                                }
+        If paranoia is not Off, it returns None.
+        """
+        if self.getUserId():
+            form = {'id': self.getUserId()}
+            data = urllib.urlencode(form)
+            userpage = BeautifulSoup(self._request("GET", self.userpage + data, "", self.whatcd.headers).execute(True).body)
+            info = self._parser().userInfo(userpage.find("div", {"class": "sidebar"}), self.name)
+            self.userinfo = info
+            return info
+        else:
+            print "no user id retrieved"
+            return None
+
+
+    def getTorrentsSeeding(self, page=1):
+        """
+        Returns a list with all the music torrents the user is seeding,
+        each as a dictionary {page (tuple with current and total), tag, dlurl, id,
+        artist (a tuple with 1 artist name || 2 names in case of two artists || 'Various Artists' if V.A.),
+        album, release type, scene, year and artistid (a tuple with 1 artist id || 2 ids if 2 artists torrent || empty if V.A.)
+        """
+        if self.userid is None:
+            self.userid = self.getUserId()
+        url = "/torrents.php?type=seeding&userid=%s&page=%d" % (self.userid, page)
+        torrentspage = BeautifulSoup(self._request("GET", url, "", self.whatcd.headers).execute(True).body)
+        return self._parser().torrentsList(torrentspage)
+
+    def getTorrentsSnatched(self, page=1):
+        """
+        Returns a list with all the music torrents the user has snatched,
+        each as a dictionary {page (tuple with current and total), tag, dlurl, id,
+        artist (a tuple with 1 artist name || 2 names in case of two artists || 'Various Artists' if V.A.),
+        album, release type, scene, year and artistid (a tuple with 1 artist id || 2 ids if 2 artists torrent || empty if V.A.)
+        """
+        if self.userid is None:
+            self.userid = self.getUserId()
+        url = "/torrents.php?type=snatched&userid=%s&page=%d" % (self.userid, page)
+        torrentspage = BeautifulSoup(self._request("GET", url, "", self.whatcd.headers).execute(True).body)
+        return self._parser().torrentsList(torrentspage)
+
+    def getTorrentsUploaded(self, page=1):
+        """
+        Returns a list with all user's uploaded music torrents,
+        each as a dictionary {page (tuple with current and total), tag, dlurl, id,
+        artist (a tuple with 1 artist name || 2 names in case of two artists || 'Various Artists' if V.A.),
+        album, release
type, scene, year and artistid (a tuple with 1 artist id || 2 ids if 2 artists torrent || empty if V.A.} + """ + if self.userid is None: + self.userid = self.getUserId() + url = "/torrents.php?type=uploaded&userid=%s&page=%d" % (self.userid, page) + torrentspage = BeautifulSoup(self._request("GET", url, "", self.whatcd.headers).execute(True).body) + return self._parser().torrentsList(torrentspage) + + + def getTorrentsCommented(self, page=1): + """ + Returns a list with all user's commented torrents + in form of dictionary {postid, torrentid, comment,postdate, pages} + + """ + if self.userid is None: + self.userid = self.getUserId() + + url = "/%s&page=%d" % (self.specificUserInfo().torrentscomments[1], page) + torrentspage = BeautifulSoup(self._request("GET", url, "", self.whatcd.headers).execute(True).body) + return self._parser().postsList(torrentspage) + + + + ############################################### + # specific values # + ############################################### + + + def specificUserInfo(self): + """ + Returns specific attributes of user info. None if user's paranoia is on + """ + info = SpecificInformation() + # Initialize attributes + info.joindate, info.lastseen, info.dataup, info.datadown,\ + info.ratio, info.rratio, info.uppercentile, info.downpercentile,\ + info.torrentsuppercentile, info.reqfilledpercentile, info.bountyspentpercentile,\ + info.postsmadepercentile, info.artistsaddedpercentile, info.overallpercentile,\ + info.postsmadecom, info.torrentscommentscom, info.collagesstartedcom, info.collagescontrcon,\ + info.reqfilledcom, info.reqvotedcom, info.uploadedcom, info.uniquecom, info.perfectcom,\ + info.seedingcom, info.leechingcom, info.snatchedcom, info.invitedcom, info.artistsaddedcom\ + = (None, None, None, None, None, None, None, None, None, None, None, None, None, None,\ + None, None, None, None, None, None, None, None, None, None, None, None, None, None) + + + if not self.userinfo and self.getInfo() is None: + pass + else: + ######## stats ########### + info.joindate = self.userinfo['stats']['joined'] + info.lastseen = self.userinfo['stats']['lastseen'] + info.dataup = self.userinfo['stats']['uploaded'] + info.datadown = self.userinfo['stats']['downloaded'] + info.ratio = self.userinfo['stats']['ratio'] + info.rratio = self.userinfo['stats']['rratio'] + ######## percentile ########### + info.uppercentile = self.userinfo['percentile']['dataup'] + info.downpercentile = self.userinfo['percentile']['datadown'] + info.torrentsuppercentile = self.userinfo['percentile']['torrentsup'] + info.reqfilledpercentile = self.userinfo['percentile']['reqfilled'] + info.bountyspentpercentile = self.userinfo['percentile']['bountyspent'] + info.postsmadepercentile = self.userinfo['percentile']['postsmade'] + info.artistsaddedpercentile = self.userinfo['percentile']['artistsadded'] + info.overallpercentile = self.userinfo['percentile']['overall'] + ######## community ########### + info.postsmadecom = self.userinfo['community']['forumposts'] + info.torrentscomments = self.userinfo['community']['torrentscomments'] + info.collagesstartedcom = self.userinfo['community']['startedcollages'] + info.collagescontrcon = self.userinfo['community']['contributedcollages'] + info.reqfilledcom = self.userinfo['community']['reqfilled'] + info.reqvotedcom = self.userinfo['community']['reqvoted'] + info.uploadedcom = self.userinfo['community']['uploaded'] + info.uniquecom = self.userinfo['community']['uniquegroups'] + info.perfectcom = self.userinfo['community']['pefectflacs'] + 
info.seedingcom = self.userinfo['community']['seeding'] + info.leechingcom = self.userinfo['community']['leeching'] + info.snatchedcom = self.userinfo['community']['snatched'] + info.invitedcom = self.userinfo['community']['invited'][0] + info.artistsaddedcom = self.userinfo['community']['artists'] + + + + return info + + +class Torrent(WhatBase): + """A What.CD torrent""" + + def __init__(self, id, page, isparent, whatcd): + """Create a torrent object. + # Parameters: + * id str: The torrent's id. + * whatcd object: the WhatCD network object + * page: The torrent page's number [optional] + """ + WhatBase.__init__(self, whatcd) + self.id = id + self.page = page + self.whatcd = whatcd + self.isParent = isparent + self.torrentpage = "/torrents.php?" + self.torrentinfo = self.getInfo() + + + def getTorrentUrl(self): + """ + Returns torrent's URL + """ + if self.isParent: + form = {'id': self.id, 'page':self.page} + data = urllib.urlencode(form) + return self.torrentpage + data + else: + form = {'torrentid': self.id, 'page':self.page} + data = urllib.urlencode(form) + headers = self._request("GET", self.torrentpage + data, "", self.whatcd.headers).execute(True).headers + + if dict(headers) is None: + return None + else: + if 'location' not in dict(headers).keys(): + return None + else: + return dict(headers)['location'] + + + def getInfo(self): + """ + Returns a dictionnary with torrents's info + """ + if self.getTorrentUrl() is None: + print "no torrent retrieved with such id" + return None + + torrentpage = BeautifulSoup(self._request("GET", "/" + self.getTorrentUrl(), "", self.whatcd.headers).execute(True).body) + + if 'Site log' in torrentpage.find("title").string: + print "no torrent retrieved with such id" + return None + else: + return self._parser().torrentInfo(torrentpage, self.id, self.isParent) + + + def getTorrentParentId(self): + """ + Returns torrent's group id + """ + if self.torrentinfo: + return self.torrentinfo['torrent']['parentid'] + + def getTorrentDownloadURL(self): + """ + Returns relative url to download the torrent + """ + if self.torrentinfo: + return self.torrentinfo['torrent']['downloadurl'] + + def getTorrentDetails(self): + """ + Returns torrent's details (format / bitrate) + """ + if self.torrentinfo: + return self.torrentinfo['torrent']['details'] + + def getTorrentEditionInfo(self): + """ + Returns torrent's edition info (Edition information / media type) + """ + if self.torrentinfo: + return self.torrentinfo['torrent']['editioninfo'] + + def getTorrentMediaType(self): + """ + Returns torrent's media type + """ + if self.torrentinfo: + return self.torrentinfo['torrent']['rlsmedia'] + + def getTorrentSize(self): + """ + Returns torrent's size + """ + if self.torrentinfo: + return self.torrentinfo['torrent']['size'] + + + def getTorrentSnatched(self): + """ + Returns torrent's total snatches + """ + if self.torrentinfo: + return self.torrentinfo['torrent']['snatched'] + + + def getTorrentSeeders(self): + """ + Returns torrent's current seeders + """ + if self.torrentinfo: + return self.torrentinfo['torrent']['seeders'] + + def getTorrentLeechers(self): + """ + Returns torrent's current leechers + """ + if self.torrentinfo: + return self.torrentinfo['torrent']['leechers'] + + def getTorrentUploadedBy(self): + """ + Returns torrent's uploader + """ + if self.torrentinfo: + return self.torrentinfo['torrent']['uploadedby'] + + def getTorrentFolderName(self): + """ + Returns torrent's folder name + """ + if self.torrentinfo: + return 
self.torrentinfo['torrent']['foldername']
+
+    def getTorrentFileList(self):
+        """
+        Returns torrent's file list
+        """
+        if self.torrentinfo:
+            return self.torrentinfo['torrent']['filelist']
+
+
+    def getTorrentReleaseType(self):
+        """
+        Returns torrent's release type
+        """
+        if self.torrentinfo:
+            return self.torrentinfo['torrent']['rlstype']
+
+    def getTorrentDescription(self):
+        """
+        Returns torrent's description, or an empty string if there's none
+        """
+        if self.torrentinfo:
+            return self.torrentinfo['torrent']['torrentdescription']
+
+    def getTorrentComments(self):
+        """
+        Returns a list of dictionaries with each comment in the torrent page
+        {postid, post, userid, username}
+        """
+        if self.torrentinfo:
+            return self.torrentinfo['torrent']['comments']
+
+    def getTorrentCommentsPagesNumber(self):
+        """
+        Returns number of pages of comments in the torrent
+        """
+        if self.torrentinfo:
+            return self.torrentinfo['torrent']['commentspages']
+
+    def isTorrentFreeLeech(self):
+        """
+        Returns True if torrent is freeleech, False if not
+        """
+        if self.torrentinfo:
+            return self.torrentinfo['torrent']['isfreeleech']
+
+    def isTorrentReported(self):
+        """
+        Returns True if torrent is reported, False if not
+        """
+        if self.torrentinfo:
+            return self.torrentinfo['torrent']['isreported']
+
+
+class Artist(WhatBase):
+    """A What.CD artist"""
+
+    def __init__(self, name, whatcd):
+        """Create an artist object.
+        # Parameters:
+            * name str: The artist's name.
+            * whatcd object: The WhatCD network object
+        """
+        WhatBase.__init__(self, whatcd)
+        self.name = name
+        self.whatcd = whatcd
+        self.artistpage = "/artist.php"
+        self.utils = Utils()
+        self.info = self.getInfo()
+
+
+    def getArtistName(self):
+        """
+        Returns artist's name
+        """
+        return self.name
+
+    def getArtistId(self):
+        """
+        Returns artist's id, None if the artist is not found
+        """
+        form = {'artistname': self.name}
+        data = urllib.urlencode(form)
+        headers = self._request("GET", self.artistpage + "?" + data, "", self.whatcd.headers).execute(True).headers
+        if dict(headers)['location'][0:14] != 'artist.php?id=':
+            return None
+        else:
+            return dict(headers)['location'][14:]
+
+    def getInfo(self):
+        """
+        Returns artist's info, None if there isn't any
+        """
+        if self.getArtistId():
+            form = {'id': self.getArtistId()}
+            data = urllib.urlencode(form)
+            artistpage = BeautifulSoup(self._request("GET", self.artistpage + "?" + data, "", self.whatcd.headers).execute(True).body)
+            return self._parser().artistInfo(artistpage)
+        else:
+            print "no artist info retrieved"
+            return None
+
+    def getArtistReleases(self):
+        """
+        Returns a list with all artist's releases in form of dictionary {releasetype, year, name, id}
+        """
+        return self.info['releases']
+
+    def getArtistImage(self):
+        """
+        Returns the artist image URL, None if there's no image
+        """
+        return self.info['image']
+
+    def getArtistInfo(self):
+        """
+        Returns the artist's info, blank string if none
+        """
+        return self.info['info']
+
+    def getArtistTags(self):
+        """
+        Returns a list with artist's tags
+        """
+        return self.info['tags']
+
+    def getArtistSimilar(self):
+        """
+        Returns a list with artist's similar artists
+        """
+        return self.info['similarartists']
+
+    def getArtistRequests(self):
+        """
+        Returns a list with all artist's requests in form of dictionary {requestname, id}
+        """
+        return self.info['requests']
+
+    def setArtistInfo(self, id, info):
+        """
+        Updates what.cd artist's info and image
+        Returns 1 if artist info updated successfully, 0 if not.
+        # Parameters:
+            * id str: what.cd artist's id
+            * info tuple: (The artist's info -str-, image url -str- (None if there isn't))
+        """
+        if info[0]:
+            params = {'action': 'edit', 'artistid': id}
+            data = urllib.urlencode(params)
+
+            edit_page = BeautifulSoup(self._request("GET", self.artistpage + "?" + data, "", self.whatcd.headers).execute(True).body)
+            what_form = self._parser().whatForm(edit_page, 'edit')
+            if info[1]:
+                image_to_post = info[1]
+            else:
+                image_to_post = what_form['image']
+            data_to_post = {'body': info[0].encode('utf-8'),
+                            'summary': 'automated artist info insertion',
+                            'image': image_to_post,
+                            'artistid': what_form['artistid'],
+                            'auth': what_form['auth'],
+                            'action': what_form['action']}
+
+            #post artist's info
+            self.whatcd.headers['Content-type'] = "application/x-www-form-urlencoded"
+            response = self._request("POST", self.artistpage, urllib.urlencode(data_to_post), self.whatcd.headers).execute(False)
+            artist_id_returned = dict(response.headers)['location'][14:]
+
+            if str(artist_id_returned) == str(what_form['artistid']):
+                return 1
+            else:
+                return 0
+
+        else:
+            print "no artist info provided. Aborting."
+            return 0
+
+
+class Parser(object):
+
+    def __init__(self, whatcd):
+        self.utils = Utils()
+        self.whatcd = whatcd
+        self.totalpages = 0
+
+    def authenticatedUserInfo(self, dom):
+        """
+        Parses the index page and returns a dictionary with basic authenticated user information
+        """
+        userInfo = {}
+        soup = BeautifulSoup(str(dom))
+        for ul in soup.fetch('ul'):
+            if ul["id"] == "userinfo_username":
+                #retrieve logged user's id
+                hrefid = ul.findAll('li')[0].find("a")["href"]
+                regid = re.compile('[0-9]+')
+                if regid.search(hrefid) is None:
+                    print "not found href to retrieve user id"
+                else:
+                    userInfo["id"] = regid.search(hrefid).group(0)
+                    print "User id: %s" % userInfo["id"]
+
+                #retrieve logged user's auth code
+                hrefauth = ul.findAll('li')[2].find("a")["href"]
+                regauth = re.compile('=[0-9a-zA-Z]+')
+                if regauth.search(hrefauth) is None:
+                    print "not found href to retrieve user auth code"
+                else:
+                    userInfo["authcode"] = regauth.search(hrefauth).group(0)[1:]
+
+            elif ul["id"] == "userinfo_stats":
+                if len(ul.findAll('li')) > 0:
+                    userInfo["uploaded"] = ul.findAll('li')[0].find("span").string
+                    userInfo["downloaded"] = ul.findAll('li')[1].find("span").string
+                    userInfo["ratio"] = ul.findAll('li')[2].findAll("span")[1].string
+                    userInfo["required"] = ul.findAll('li')[3].find("span").string
+                    userInfo["authenticate"] = True
+
+        return userInfo
+
+    def userInfo(self, dom, user):
+        """
+        Parses a user's page and returns a dictionary with its information
+
+        # Parameters:
+            * dom str: user page html
+            * user str: what.cd username
+        """
+        userInfo = {'stats':{}, 'percentile':{}, 'community':{}}
+        soup = BeautifulSoup(str(dom))
+
+        for div in soup.fetch('div', {'class':'box'}):
+
+            #if paranoia is not set to 'Off', stop collecting data
+            if div.findAll('div')[0].string == "Personal":
+                if div.find('ul').findAll('li')[1].contents[1].string.strip() != "Off":
+                    return None
+
+        statscontainer = soup.findAll('div', {'class':'box'})[1]
+        percentilecontainer = soup.findAll('div', {'class':'box'})[2]
+        communitycontainer = soup.findAll('div', {'class':'box'})[4]
+
+
+        userInfo['stats']['joined'] = statscontainer.findAll('li')[0].find('span')['title']
+        userInfo['stats']['lastseen'] = statscontainer.findAll('li')[1].find('span')['title']
+        userInfo['stats']['uploaded'] = statscontainer.findAll('li')[2].string[10:]
+        userInfo['stats']['downloaded'] = 
statscontainer.findAll('li')[3].string[12:] + userInfo['stats']['ratio'] = statscontainer.findAll('li')[4].find('span').string + userInfo['stats']['rratio'] = statscontainer.findAll('li')[5].string[16:] + userInfo['percentile']['dataup'] = percentilecontainer.findAll('li')[0].string[15:] + userInfo['percentile']['datadown'] = percentilecontainer.findAll('li')[1].string[17:] + userInfo['percentile']['torrentsup'] = percentilecontainer.findAll('li')[2].string[19:] + userInfo['percentile']['reqfilled'] = percentilecontainer.findAll('li')[3].string[17:] + userInfo['percentile']['bountyspent'] = percentilecontainer.findAll('li')[4].string[14:] + userInfo['percentile']['postsmade'] = percentilecontainer.findAll('li')[5].string[12:] + userInfo['percentile']['artistsadded'] = percentilecontainer.findAll('li')[6].string[15:] + userInfo['percentile']['overall'] = percentilecontainer.findAll('li')[7].find('strong').string[14:] + + userInfo['community']['forumposts'] = (communitycontainer.findAll('li')[0].contents[0].string[13:len(communitycontainer.findAll('li')[0].contents[0].string)-2],\ + communitycontainer.findAll('li')[0].find('a')['href']) + userInfo['community']['torrentscomments'] = (communitycontainer.findAll('li')[1].contents[0].string[18:len(communitycontainer.findAll('li')[1].contents[0].string)-2],\ + communitycontainer.findAll('li')[1].find('a')['href']) + userInfo['community']['startedcollages'] = (communitycontainer.findAll('li')[2].contents[0].string[18:len(communitycontainer.findAll('li')[2].contents[0].string)-2],\ + communitycontainer.findAll('li')[2].find('a')['href']) + userInfo['community']['contributedcollages'] = (communitycontainer.findAll('li')[3].contents[0].string[25:len(communitycontainer.findAll('li')[3].contents[0].string)-2],\ + communitycontainer.findAll('li')[3].find('a')['href']) + userInfo['community']['reqfilled'] = (communitycontainer.findAll('li')[4].contents[0].string[17:len(communitycontainer.findAll('li')[4].contents[0].string)-2],\ + communitycontainer.findAll('li')[4].find('a')['href']) + userInfo['community']['reqvoted'] = (communitycontainer.findAll('li')[5].contents[0].string[16:len(communitycontainer.findAll('li')[5].contents[0].string)-2],\ + communitycontainer.findAll('li')[5].find('a')['href']) + userInfo['community']['uploaded'] = (communitycontainer.findAll('li')[6].contents[0].string[10:len(communitycontainer.findAll('li')[6].contents[0].string)-2],\ + communitycontainer.findAll('li')[6].find('a')['href']) + userInfo['community']['uniquegroups'] = (communitycontainer.findAll('li')[7].contents[0].string[15:len(communitycontainer.findAll('li')[7].contents[0].string)-2],\ + communitycontainer.findAll('li')[7].find('a')['href']) + userInfo['community']['pefectflacs'] = (communitycontainer.findAll('li')[8].contents[0].string[16:len(communitycontainer.findAll('li')[8].contents[0].string)-2],\ + communitycontainer.findAll('li')[8].find('a')['href']) + userInfo['community']['seeding'] = (communitycontainer.findAll('li')[9].contents[0].string[9:len(communitycontainer.findAll('li')[9].contents[0].string)-2],\ + communitycontainer.findAll('li')[9].find('a')['href']) + userInfo['community']['leeching'] = (communitycontainer.findAll('li')[10].contents[0].string[10:len(communitycontainer.findAll('li')[10].contents[0].string)-2],\ + communitycontainer.findAll('li')[10].find('a')['href']) + #NB: there's a carriage return and white spaces inside the snatched li tag + userInfo['community']['snatched'] = 
(communitycontainer.findAll('li')[11].contents[0].string[10:len(communitycontainer.findAll('li')[11].contents[0].string)-7],\
+                                            communitycontainer.findAll('li')[11].find('a')['href'])
+        userInfo['community']['invited'] = (communitycontainer.findAll('li')[12].contents[0].string[9:],\
+                                            None)
+        userInfo['community']['artists'] = percentilecontainer.findAll('li')[6]['title']
+
+        return userInfo
+
+    def torrentInfo(self, dom, id, isparent):
+        """
+        Parses a torrent's page and returns a dictionary with its information
+        """
+
+        torrentInfo = {'torrent':{}}
+        torrentfiles = []
+        torrentdescription = ""
+        isreported = False
+        isfreeleech = False
+        soup = BeautifulSoup(str(dom))
+        if isparent:
+            torrentInfo['torrent']['parentid'] = id
+        else:
+            groupidurl = soup.findAll('div', {'class':'linkbox'})[0].find('a')['href']
+            torrentInfo['torrent']['editioninfo'] = soup.findAll('td', {'class':'edition_info'})[0].find('strong').contents[-1]
+            regrlsmedia = re.compile('CD|DVD|Vinyl|Soundboard|SACD|Cassette|WEB|Blu-ray')
+            torrentInfo['torrent']['rlsmedia'] = regrlsmedia.search(torrentInfo['torrent']['editioninfo']).group(0)
+            torrentInfo['torrent']['parentid'] = groupidurl[groupidurl.rfind("=") + 1:]
+        torrentInfo['torrent']['downloadurl'] = soup.findAll('tr', {'id':'torrent%s' % id})[0].findAll('a', {'title':'Download'})[0]['href']
+        ## is freeleech or/and reported? ##
+        #both
+        if len(soup.findAll('tr', {'id':'torrent%s' % id})[0].findAll('a')[-1].contents) == 4:
+            isreported = True
+            isfreeleech = True
+            torrentInfo['torrent']['details'] = soup.findAll('tr', {'id':'torrent%s' % id})[0].findAll('a')[-1].contents[0]
+        #either
+        elif len(soup.findAll('tr', {'id':'torrent%s' % id})[0].findAll('a')[-1].contents) == 2:
+            if soup.findAll('tr', {'id':'torrent%s' % id})[0].findAll('a')[-1].contents[1].string == 'Reported':
+                isreported = True
+            elif soup.findAll('tr', {'id':'torrent%s' % id})[0].findAll('a')[-1].contents[1].string == 'Freeleech!':
+                isfreeleech = True
+            torrentInfo['torrent']['details'] = soup.findAll('tr', {'id':'torrent%s' % id})[0].findAll('a')[-1].contents[0]
+        #none
+        else:
+            torrentInfo['torrent']['details'] = soup.findAll('tr', {'id':'torrent%s' % id})[0].findAll('a')[-1].contents[0]
+        torrentInfo['torrent']['isfreeleech'] = isfreeleech
+        torrentInfo['torrent']['isreported'] = isreported
+        torrentInfo['torrent']['size'] = soup.findAll('tr', {'id':'torrent%s' % id})[0].findAll('td')[1].string
+        torrentInfo['torrent']['snatched'] = soup.findAll('tr', {'id':'torrent%s' % id})[0].findAll('td')[2].string
+        torrentInfo['torrent']['seeders'] = soup.findAll('tr', {'id':'torrent%s' % id})[0].findAll('td')[3].string
+        torrentInfo['torrent']['leechers'] = soup.findAll('tr', {'id':'torrent%s' % id})[0].findAll('td')[4].string
+        torrentInfo['torrent']['uploadedby'] = soup.findAll('tr', {'id':'torrent_%s' % id})[0].findAll('a')[0].string
+        foldername = soup.findAll('div', {'id':'files_%s' % id})[0].findAll('div')[1].string
+        if foldername is None:
+            torrentInfo['torrent']['foldername'] = None
+        else:
+            torrentInfo['torrent']['foldername'] = self.utils.decodeHTMLEntities(foldername)
+        files = soup.findAll('div', {'id':'files_%s' % id})[0].findAll('tr')
+        for file in files[1:-1]:
+            torrentfiles.append(self.utils.decodeHTMLEntities(file.contents[0].string))
+        torrentInfo['torrent']['filelist'] = torrentfiles
+        #is there any description?
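+        #(the description, when present, is the second blockquote in the torrent's row)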
+ if len(soup.findAll('tr', {'id':'torrent_%s' % id})[0].findAll('blockquote')) > 1: + description = torrentInfo['torrent']['description'] = soup.findAll('tr', {'id':'torrent_%s' % id})[0].findAll('blockquote')[1].contents + info = '' + for content in description: + if content.string: + info = "%s%s" % (info, self.utils._string(content.string)) + torrentdescription = "%s%s" % (torrentdescription, self.utils._string(content.string)) + torrentInfo['torrent']['torrentdescription'] = torrentdescription + regrlstype = re.compile('Album|Soundtrack|EP|Anthology|Compilation|DJ Mix|Single|Live album|Remix|Bootleg|Interview|Mixtape|Unknown') + torrentInfo['torrent']['rlstype'] = regrlstype.search(soup.find('div', {'class':'thin'}).find('h2').contents[1]).group(0) + + torrentInfo['torrent']['comments'] = [] + torrentInfo['torrent']['commentspages'] = 0 + + if len(soup.findAll('table', {'class':'forum_post box vertical_margin'})) > 0: + linkbox = dom.findAll("div", {"class": "linkbox"})[-1] + pages = 1 + postid = '' + userid = '' + post = '' + # if there's more than 1 page of torrents + if linkbox.find("a"): + # by default torrent page show last page of comments + lastpage = linkbox.findAll("a")[-1]['href'] + pages = int(lastpage[18:lastpage.find('&')]) + 1 + for comment in soup.findAll('table', {'class':'forum_post box vertical_margin'}): + postid = comment.find("a", {"class":"post_id"}).string[1:] + userid = comment.findAll("a")[1]['href'][12:] + username = comment.findAll("a")[1].string + post = comment.find("div", {"id":"content" + postid}) + post = u''.join([post.string for post in post.findAll(text=True)]) + torrentInfo['torrent']['comments'].append({"postid":postid, "post":post, "userid":userid, "username":username}) + + torrentInfo['torrent']['commentspages'] = pages + + return torrentInfo + + def artistInfo(self, dom): + """ + Parse an artist's page and returns a dictionnary with its information + """ + artistInfo = {} + releases = [] + requests = [] + infoartist = "" + tagsartist = [] + similarartists = [] + soup = BeautifulSoup(str(dom)) + for releasetype in soup.fetch('table', {'class':'torrent_table'}): + releasetypenames = releasetype.findAll('strong') + releasetypename = releasetype.findAll('strong')[0].string + for release in releasetypenames[1:-1]: + #skip release edition info and Freeleech! s + if len(release.parent.contents) > 1 and len(release.contents) > 1: + releaseyear = release.contents[0][0:4] + releasename = release.contents[1].string + releasehref = release.contents[1]['href'] + releaseid = releasehref[releasehref.rfind('=') + 1:] + releases.append({'releasetype':releasetypename,\ + 'year': releaseyear, 'name':self.utils.decodeHTMLEntities(releasename), 'id':releaseid}) + + artistInfo['releases'] = releases + #is there an artist image? + artistInfo['image'] = None + if soup.find('div', {'class':'box'}).find('img'): + artistInfo['image'] = soup.find('div', {'class':'box'}).find('img')['src'] + #is there any artist info? + contents = soup.find('div', {'class':'body'}).contents + if len(contents) > 0: + for content in contents: + if content.string: + infoartist = "%s%s" % (infoartist, self.utils._string(content.string)) + artistInfo['info'] = self.utils.decodeHTMLEntities(infoartist) + #is there any artist tags? 
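+        #(the tags are the items of the first 'stats nobullet' list on the artist page)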
+ if soup.findAll('ul', {'class':'stats nobullet'})[0].findAll('li'): + ul = soup.findAll('ul', {'class':'stats nobullet'})[0].findAll('li') + for li in ul: + if li.contents[0].string: + tagsartist.append(self.utils._string(li.contents[0].string)) + artistInfo['tags'] = tagsartist + #is there any similar artist? + if soup.findAll('ul', {'class':'stats nobullet'})[2].findAll('span', {'title':'2'}): + artists = soup.findAll('ul', {'class':'stats nobullet'})[2].findAll('span', {'title':'2'}) + for artist in artists: + if artist.contents[0].string: + similarartists.append(self.utils._string(artist.contents[0].string)) + artistInfo['similarartists'] = similarartists + #is there any request? + if soup.find('table', {'id':'requests'}): + for request in soup.find('table', {'id':'requests'}).findAll('tr', {'class':re.compile('row')}): + requests.append({'requestname':request.findAll('a')[1].string, 'id':request.findAll('a')[1]['href'][28:]}) + + artistInfo['requests'] = requests + + return artistInfo + + def torrentsList(self, dom): + """ + Parse a torrent's list page and returns a dictionnary with its information + """ + torrentslist = [] + torrentssoup = dom.find("table", {"width": "100%"}) + pages = 0 + + #if there's at least 1 torrent in the list + if torrentssoup: + navsoup = dom.find("div", {"class": "linkbox"}) + pages = 1 + regyear = re.compile('\[\d{4}\]') + + #is there a page navigation bar? + if navsoup.contents: + #if there's more than 1 page of torrents + if navsoup.contents[-1].has_key('href'): + lastpage = navsoup.contents[-1]['href'] + pages = lastpage[18:lastpage.find('&')] + self.totalpages = pages + else: #we are at the last page, no href + pages = self.totalpages + 1 + #fetch all tr except first one (column head) + for torrent in torrentssoup.fetch('tr')[1:]: + #exclude non music torrents + if torrent.find('td').find('div')['class'][0:10] == 'cats_music': + + torrenttag = torrent.find('td').contents[1]['title'] + torrentdl = torrent.findAll('td')[1].find('span').findAll('a')[0]['href'] + torrentrm = torrent.findAll('td')[1].find('span').findAll('a')[1]['href'] + torrentid = torrentrm[torrentrm.rfind('=') + 1:] + torrenttd = torrent.findAll('td')[1] + + # remove dataless elements + torrenttags = torrenttd.div + rightlinks = torrenttd.span + torrenttags.extract() + rightlinks.extract() + + # remove line breaks + torrenttd = "".join([line.strip() for line in str(torrenttd).split("\n")]) + torrenttd = BeautifulSoup(torrenttd) + isScene = False + info = "" + + if len(torrenttd.findAll('a')) == 2: + #one artist + torrentartist = (self.utils.decodeHTMLEntities(torrenttd.find("a").string),) + artistid = (torrenttd.find("a")['href'][14:],) + torrentalbum = torrenttd.findAll("a")[1].string + info = torrenttd.findAll("a")[1].nextSibling.string.strip() + + + elif len(torrenttd.findAll('a')) == 1: + #various artists + torrentartist = ('Various Artists',) + artistid = () + torrentalbum = torrenttd.find("a").string + info = torrenttd.find("a").nextSibling.string.strip() + + elif len(torrenttd.findAll('a')) == 3: + #two artists + torrentartist = (self.utils.decodeHTMLEntities(torrenttd.findAll("a")[0].string),\ + self.utils.decodeHTMLEntities(torrenttd.findAll("a")[1].string)) + artistid = (torrenttd.findAll("a")[0]['href'][14:],\ + torrenttd.findAll("a")[1]['href'][14:]) + torrentalbum = torrenttd.findAll("a")[2].string + info = torrenttd.findAll("a")[2].nextSibling.string.strip() + + elif torrenttd.find(text=re.compile('performed by')): + #performed by + torrentartist = 
(self.utils.decodeHTMLEntities(torrenttd.findAll("a")[-2].string),)
+                        artistid = (torrenttd.findAll("a")[-2]['href'][14:],)
+                        torrentalbum = torrenttd.findAll("a")[-1].string
+                        info = torrenttd.findAll("a")[-1].nextSibling.string.strip()
+
+                    if 'Scene' in info:
+                        isScene = True
+
+                    torrentyear = regyear.search(info).group(0)[1:5]
+                    torrentslist.append({'tag': torrenttag,
+                                         'dlurl': torrentdl,
+                                         'id': torrentid,
+                                         'artist': torrentartist,
+                                         'artistid': artistid,
+                                         'album': self.utils.decodeHTMLEntities(torrentalbum),
+                                         'year': torrentyear,
+                                         'pages': pages,
+                                         'scene': isScene})
+
+        return torrentslist
+
+    def postsList(self, dom):
+        """
+        Parses a post list page and returns a list of dictionaries, one per post:
+        {postid, torrentid, comment, postdate, pages}
+        """
+        postslist = []
+        postssoup = dom.find("div", {"class": "thin"})
+        pages = 0
+
+        #if there's at least 1 post in the list
+        if postssoup:
+            navsoup = dom.find("div", {"class": "linkbox"})
+
+            #if there's more than 1 page of posts
+            if navsoup.find("a"):
+                lastpage = navsoup.findAll("a")[1]['href']
+                pages = lastpage[18:lastpage.find('&')]
+                self.totalpages = pages
+            else: #we are at the last page, no link
+                pages = 1
+
+            for post in postssoup.fetch('table', {'class':'forum_post box vertical_margin'}):
+                commentbody = post.find("td", {"class":"body"})
+                postid = post.find("span").findAll("a")[0].string[1:]
+                torrentid = post.find("span").findAll("a")[-1]['href'][post.find("span").findAll("a")[-1]['href'].rfind('=') + 1:]
+                comment = u''.join([commentbody.string for commentbody in commentbody.findAll(text=True)])
+                postdate = post.find("span", {"class":"time"})['title']
+                postslist.append({'postid': postid,
+                                  'torrentid': torrentid,
+                                  'comment': comment,
+                                  'postdate': postdate,
+                                  'pages': pages})
+
+
+        return postslist
+
+
+    def whatForm(self, dom, action):
+        """
+        Parses a what.cd edit page and returns a dict mapping each form input/textarea name to its value
+        # Parameters:
+            * dom str: the edit page dom.
+            * action str: the action value from the requested form
+        """
+        inputs = {}
+
+        form = dom.find('input', {'name':'action', 'value':action}).parent
+        elements = form.fetch(('input', 'textarea'))
+        #keep only the first three form elements (later ones, submit input included, are dropped)
+        for element in elements[0:3]:
+            name = element.get('name', None)
+            if element.name == 'textarea':
+                inputs[name] = element.string
+            else:
+                inputs[name] = element.get('value', None)
+        return inputs
+
+
+
+if __name__ == "__main__":
+    print "Module to manage what.cd as a web service"
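+
+    # A quick usage sketch (hypothetical credentials; needs a valid
+    # what.cd account and network access):
+    #
+    #   whatcd = getWhatcdNetwork("myuser", "mypass")
+    #   print whatcd.getCredentials().getAuthenticatedUserRatio()
+    #   print whatcd.getArtist("Some Artist").getArtistTags()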