parse_tweet.py
#!/usr/bin/env python
import re
from twitter import TwitterError # import not used?
class Emoticons:
POSITIVE = ["*O","*-*","*O*","*o*","* *",
":P",":D",":d",":p",
";P",";D",";d",";p",
":-)",";-)",":=)",";=)",
":<)",":>)",";>)",";=)",
"=}",":)","(:;)",
"(;",":}","{:",";}",
"{;:]",
"[;",":')",";')",":-3",
"{;",":]",
";-3",":-x",";-x",":-X",
";-X",":-}",";-=}",":-]",
";-]",":-.)",
"^_^","^-^"]
NEGATIVE = [":(",";(",":'(",
"=(","={","):",");",
")':",")';",")=","}=",
";-{{",";-{",":-{{",":-{",
":-(",";-(",
":,)",":'{",
"[:",";]"
]
class ParseTweet(object):
# compile once on import
regexp = {"RT": "^RT", "MT": r"^MT", "ALNUM": r"(@[a-zA-Z0-9_]+)",
"HASHTAG": r"(#[\w\d]+)", "URL": r"([https://|http://]?[a-zA-Z\d\/]+[\.]+[a-zA-Z\d\/\.]+)",
"SPACES":r"\s+"}
regexp = dict((key, re.compile(value)) for key, value in regexp.items())
def __init__(self, timeline_owner, tweet):
""" timeline_owner : twitter handle of user account. tweet - 140 chars from feed; object does all computation on construction
properties:
RT, MT - boolean
URLs - list of URL
Hashtags - list of tags
"""
self.Owner = timeline_owner
self.tweet = tweet
self.UserHandles = ParseTweet.getUserHandles(tweet)
self.Hashtags = ParseTweet.getHashtags(tweet)
self.URLs = ParseTweet.getURLs(tweet)
self.RT = ParseTweet.getAttributeRT(tweet)
self.MT = ParseTweet.getAttributeMT(tweet)
self.Emoticon = ParseTweet.getAttributeEmoticon(tweet)
# additional intelligence
if ( self.RT and len(self.UserHandles) > 0 ): # change the owner of tweet?
self.Owner = self.UserHandles[0]
return
def __str__(self):
""" for display method """
return "owner %s, urls: %d, hashtags %d, user_handles %d, len_tweet %d, RT = %s, MT = %s" % (
self.Owner, len(self.URLs), len(self.Hashtags), len(self.UserHandles), len(self.tweet), self.RT, self.MT)
@staticmethod
def getAttributeEmoticon(tweet):
""" see if tweet is contains any emoticons, +ve, -ve or neutral """
emoji = list()
for tok in re.split(ParseTweet.regexp["SPACES"],tweet.strip()):
if tok in Emoticons.POSITIVE:
emoji.append( tok )
continue
if tok in Emoticons.NEGATIVE:
emoji.append( tok )
return emoji
@staticmethod
def getAttributeRT(tweet):
""" see if tweet is a RT """
return re.search(ParseTweet.regexp["RT"], tweet.strip()) != None
@staticmethod
def getAttributeMT(tweet):
""" see if tweet is a MT """
return re.search(ParseTweet.regexp["MT"], tweet.strip()) != None
@staticmethod
def getUserHandles(tweet):
""" given a tweet we try and extract all user handles in order of occurrence"""
return re.findall(ParseTweet.regexp["ALNUM"], tweet)
@staticmethod
def getHashtags(tweet):
""" return all hashtags"""
return re.findall(ParseTweet.regexp["HASHTAG"], tweet)
@staticmethod
def getURLs(tweet):
""" URL : [http://]?[\w\.?/]+"""
return re.findall(ParseTweet.regexp["URL"], tweet)