Raw File
parse_tweet.py
#!/usr/bin/env python

import re

from twitter import TwitterError  # import not used?

class Emoticons:
    """Lookup tables of ASCII emoticons, grouped by sentiment.

    Both attributes are plain lists of exact, whitespace-delimited tokens;
    membership is tested with ``token in Emoticons.POSITIVE`` (or
    ``NEGATIVE``).
    """

    # Tokens that signal a positive / happy sentiment.
    POSITIVE = [
        "*O", "*-*", "*O*", "*o*", "* *",
        ":P", ":D", ":d", ":p",
        ";P", ";D", ";d", ";p",
        ":-)", ";-)", ":=)", ";=)",
        ":<)", ":>)", ";>)",
        "=}", ":)",
        "(;", ":}", "{:", ";}",
        "[;", ":')", ";')", ":-3",
        "{;", ":]",
        ";-3", ":-x", ";-x", ":-X",
        ";-X", ":-}", ":-]",
        ";-]", ":-.)",
        "^_^", "^-^",
        # These three literals look like neighbouring tokens fused by a
        # missing comma / stray character. They are kept so that anything
        # already relying on them keeps working.
        "(:;)", "{;:]", ";-=}",
        # The tokens the fused literals above were most likely meant to be
        # ("{;" and ":]" are already listed).
        "(:", ";)", ";-}",
    ]

    # Tokens that signal a negative / sad sentiment.
    NEGATIVE = [
        ":(", ";(", ":'(",
        "=(", "={", "):", ");",
        ")':", ")';", ")=", "}=",
        ";-{{", ";-{", ":-{{", ":-{",
        ":-(", ";-(",
        ":,)", ":'{",
        "[:", ";]",
    ]

class ParseTweet(object):
    """Extract handles, hashtags, URLs and RT/MT/emoticon markers from a tweet.

    All parsing is done in the constructor and the results are exposed as
    plain attributes. The extraction helpers are static methods, so they can
    also be called directly on a raw tweet string.
    """

    # Patterns are compiled once, when the class body is executed.
    regexp = {
        # The trailing \b stops words that merely begin with these letters
        # (e.g. "RTFM", "MTV") from being reported as retweets / modified
        # tweets.
        "RT": re.compile(r"^RT\b"),
        "MT": re.compile(r"^MT\b"),
        "ALNUM": re.compile(r"(@[a-zA-Z0-9_]+)"),
        "HASHTAG": re.compile(r"(#\w+)"),
        # The optional scheme has to be a (non-capturing) group. Written as a
        # character class it matches one stray character instead, which turns
        # "http://example.com" into "://example.com".
        "URL": re.compile(r"((?:https?://)?[a-zA-Z\d/]+\.+[a-zA-Z\d/.]+)"),
        "SPACES": re.compile(r"\s+"),
    }

    def __init__(self, timeline_owner, tweet):
        """Parse ``tweet`` and store everything that was found.

        Args:
            timeline_owner: Twitter handle of the account whose timeline the
                tweet was read from.
            tweet: Text of the tweet.

        Attributes set:
            Owner: ``timeline_owner``, or the first handle mentioned in the
                tweet when the tweet is a retweet.
            tweet: The text, unchanged.
            UserHandles: List of ``@handle`` strings, in order of occurrence.
            Hashtags: List of ``#tag`` strings.
            URLs: List of URL strings.
            RT, MT: Booleans, true when the tweet starts with ``RT`` / ``MT``.
            Emoticon: List of the emoticon tokens found.
        """
        self.Owner = timeline_owner
        self.tweet = tweet
        self.UserHandles = ParseTweet.getUserHandles(tweet)
        self.Hashtags = ParseTweet.getHashtags(tweet)
        self.URLs = ParseTweet.getURLs(tweet)
        self.RT = ParseTweet.getAttributeRT(tweet)
        self.MT = ParseTweet.getAttributeMT(tweet)
        self.Emoticon = ParseTweet.getAttributeEmoticon(tweet)

        # A retweet is attributed to the first account it mentions rather
        # than to the owner of the timeline it appeared on.
        if self.RT and self.UserHandles:
            self.Owner = self.UserHandles[0]

    def __str__(self):
        """Return a one-line summary: owner, item counts and RT/MT flags."""
        return "owner %s, urls: %d, hashtags %d, user_handles %d, len_tweet %d, RT = %s, MT = %s" % (
            self.Owner, len(self.URLs), len(self.Hashtags),
            len(self.UserHandles), len(self.tweet), self.RT, self.MT)

    @staticmethod
    def getAttributeEmoticon(tweet):
        """Return the emoticons found in ``tweet``, in order of occurrence.

        The tweet is split on whitespace and every token that appears in
        ``Emoticons.POSITIVE`` or ``Emoticons.NEGATIVE`` is returned; the
        result does not record which of the two lists a token came from.
        """
        tokens = ParseTweet.regexp["SPACES"].split(tweet.strip())
        return [tok for tok in tokens
                if tok in Emoticons.POSITIVE or tok in Emoticons.NEGATIVE]

    @staticmethod
    def getAttributeRT(tweet):
        """Return True if ``tweet`` starts with the standalone word ``RT``.

        Leading and trailing whitespace is ignored.
        """
        return ParseTweet.regexp["RT"].search(tweet.strip()) is not None

    @staticmethod
    def getAttributeMT(tweet):
        """Return True if ``tweet`` starts with the standalone word ``MT``.

        Leading and trailing whitespace is ignored.
        """
        return ParseTweet.regexp["MT"].search(tweet.strip()) is not None

    @staticmethod
    def getUserHandles(tweet):
        """Return every ``@handle`` in ``tweet``, in order of occurrence."""
        return ParseTweet.regexp["ALNUM"].findall(tweet)

    @staticmethod
    def getHashtags(tweet):
        """Return every ``#hashtag`` in ``tweet``, in order of occurrence."""
        return ParseTweet.regexp["HASHTAG"].findall(tweet)

    @staticmethod
    def getURLs(tweet):
        """Return every URL-like substring of ``tweet``, in order of occurrence.

        A match is an optional ``http://`` or ``https://`` scheme followed by
        letters, digits or slashes, at least one dot, and then more letters,
        digits, slashes or dots. Scheme-less links such as ``t.co/abc`` are
        matched too.
        """
        return ParseTweet.regexp["URL"].findall(tweet)
back to top