#!/usr/bin/env python2.4 import sys import feedparser reload(sys) sys.setdefaultencoding('utf-8') encodings = [ 'windows-1251', 'cp1251', 'koi8-r', 'koi8-u', ] encunicodes = [ 'utf-8', 'utf8', 'utf-16', 'utf16', 'ucs-2', ] badlangs = [ 'zh' ] encbonusValue = 1000 langbonusValue = -1000 rules = {} Debug = False def bonus(reason, value): ## XXX Hack for russian letters count length = 40 + reason.count('\xd0') + reason.count('\xd1') if Debug: print " %-*s | %10d" % ( length, reason, value) return value def encBonus(enc): if enc in encodings: return bonus('Good encoding (%s)' % enc, encbonusValue) elif enc in encunicodes: return bonus('Unicode encoding (%s), no bonuses' % enc, 0) else: return reason('Bad encoding (%s)' % enc, -encbonusValue) def langBonus(lang): rb = 0 for pattern in badlangs: if lang.count(pattern) > 0: rb += bonus('Bonus for language pattern "%s" (lang="%s")' % (pattern, lang), langbonusValue) return rb def getRules(): global rules for rule in [x.strip().split('>>',2) for x in open('rules.txt').readlines() if len(x) > 1]: if len(rule) != 2: continue (rule, weight) = (rule[1].strip(), int(rule[0])) rule = rule [ rule.index('"') + 1 : rule.rindex('"') ] rules[rule] = weight def main(args): global Debug for url in args: if url == '-d': Debug = True continue if Debug: print "> Parsing %s" % url fl = feedparser.parse(url) points = encBonus(fl.encoding.lower()) try: points += langBonus(fl.feed.language.lower()) except: pass text = '\n'.join( [ item.summary_detail.value for item in fl.entries if item.has_key('summary_detail')]) text += '\n'.join( [ item.title_detail.value for item in fl.entries if item.has_key('title_detail' )]) for key in rules.keys(): cnt = text.count(key) increment = cnt * rules[key] if increment != 0 : points += increment points += bonus("for key %s (%d nums)" % (key, cnt), increment) if points < 0: print "Feed: %s not in Russian (weight: %d)" % (url, points) else: print "Feed: %s is good (weight: %d)" % (url, points) if __name__ == '__main__': getRules() main(sys.argv[1:])