swh:1:snp:abb1cef73095ab682f673d37376fa97c3597e515
Tip revision: 58339950480a3a4e1bfa6c90a1faccecb97db3f5 authored by John Beieler on 15 August 2016, 17:42:17 UTC
Now with more proxy.
Tip revision: 5833995
mongo_connection.py
import json
import requests
import datetime
from kafka import SimpleProducer, KafkaClient
def add_entry(collection, text, text_feats, title, url, date, website, lang):
    """
    Function that creates the dictionary of content to add to a MongoDB
    instance and inserts the information into an external data store.

    Also forwards a JSON-serializable copy of the record to Elasticsearch,
    and to a Kafka topic when the text matches ISIL-related keywords.

    Parameters
    ----------
    collection : pymongo Collection.
        Collection within MongoDB that in which results are stored.
    text : String.
        Text from a given webpage.
    text_feats : Dict.
        Features returned by the hermes API.
    title : String.
        Title of the news story.
    url : String.
        URL of the webpage from which the content was pulled.
    date : String.
        Date pulled from the RSS feed.
    website : String.
        Nickname of the site from which the content was pulled.
    lang : String.
        Language of the story, e.g. 'english' or 'arabic'.

    Returns
    -------
    object_id : String
        ID of the inserted MongoDB document.
    """
    to_insert = make_entry(collection, text, text_feats, title, url, date,
                           website, lang)
    object_id = collection.insert(to_insert)
    # Shallow-copy so the JSON-friendly mutations below don't clobber the
    # document dict that was just inserted (insert() adds '_id' to it).
    json_friendly = dict(to_insert)
    json_friendly['_id'] = str(json_friendly['_id'])
    if 'mitie_info' in json_friendly:
        # MITIE values were decoded in make_entry; re-encode for transport.
        for key in json_friendly['mitie_info']:
            json_friendly['mitie_info'][key] = json.dumps(json_friendly['mitie_info'][key])
    id_str = json_friendly['_id']
    json_friendly['date'] = str(json_friendly['date'])
    json_friendly['date_added'] = json_friendly['date_added'].strftime("%Y-%m-%dT%H:%M:%S")
    json_friendly = json.dumps(json_friendly)
    # Send to Elasticsearch
    base_url = 'http://52.6.147.254:9200/api/stories/'
    url = base_url + '{}/_create'.format(id_str)
    print('\tSending to ES.')
    out = requests.put(url, data=json_friendly)
    # ES returns 201 on successful _create; anything else is an error.
    if out.status_code != 201:
        print('\tError sending to ES.')
        print(out.status_code)
        print(out.json())
    # Send "ISIL-related" stories to XDATA
    # Keywords defined by Uncharted
    keywords = ['isil', 'eiil', 'isis', 'islamic', 'taliban', 'qaeda',
                'caliphate', 'daesh']
    # NOTE(review): match is case-sensitive; capitalized mentions (e.g.
    # "ISIS") are missed — confirm whether text is pre-lowercased upstream.
    if any(x in text for x in keywords):
        print('\tSending to Kafka...')
        kafka = KafkaClient('k01.istresearch.com:9092')
        producer = SimpleProducer(kafka)
        producer.send_messages("caerus-news", json_friendly)
    return object_id
def make_entry(collection, text, text_feats, title, url, date, website, lang):
    """
    Function that creates the dictionary of content to add to an external data
    store.

    Parameters
    ----------
    collection : pymongo Collection.
        Unused here; kept for signature compatibility with add_entry.
    text : String.
        Text from a given webpage.
    text_feats : Dict.
        Features returned by the hermes API. May be empty/None, in which
        case placeholder values are stored.
    title : String.
        Title of the news story.
    url : String.
        URL of the webpage from which the content was pulled.
    date : String.
        Date pulled from the RSS feed.
    website : String.
        Nickname of the site from which the content was pulled.
    lang : String.
        Either 'arabic' or 'english'.

    Returns
    -------
    to_insert : Dictionary.
        Dictionary of text and other content.

    Raises
    ------
    ValueError
        If ``lang`` is neither 'arabic' nor 'english'.
    """
    if lang == 'arabic':
        return {"url": url,
                "title": title,
                "source": website,
                "date": date,
                "date_added": datetime.datetime.utcnow(),
                "content_ar": text,
                "content_en": '',
                "stanford": 0,
                "geo": 0,
                "language": lang}
    if lang != 'english':
        # Previously fell through and raised NameError on the return value;
        # fail explicitly instead.
        raise ValueError("Unsupported language: {}".format(lang))
    if text_feats:
        # Stanford parsing is currently disabled; store empty placeholders.
        trees = []
        full_stanford = {}
        stanford_coded = 0
        mitie_info = text_feats['MITIE']
        # A 'status' key signals an upstream MITIE error; drop the payload.
        if 'status' in mitie_info:
            mitie_info = {}
        else:
            # MITIE values arrive JSON-encoded; decode them in place.
            for key in mitie_info:
                mitie_info[key] = json.loads(mitie_info[key])
        geo_info = text_feats['CLIFF']
        mordecai_info = text_feats['mordecai']
        topic_info = json.loads(text_feats['topic_model'])
        good_text_feats = 1
    else:
        trees = []
        stanford_coded = 0
        mitie_info = {}
        geo_info = {}
        # BUG FIX: mordecai_info was previously unbound in this branch,
        # raising NameError when text_feats was empty for English stories.
        mordecai_info = {}
        topic_info = {}
        full_stanford = {}
        good_text_feats = 0
    to_insert = {"url": url,
                 "title": title,
                 "source": website,
                 "date": date,
                 "date_added": datetime.datetime.utcnow(),
                 "content_en": text,
                 "content_ar": '',
                 "stanford": stanford_coded,
                 "good_text_feats": good_text_feats,
                 "mitie_info": mitie_info,
                 "geo_info": geo_info,
                 "mordecai_info": mordecai_info,
                 "topic_info": topic_info,
                 "full_stanford": full_stanford,
                 "parsed_sents": trees,
                 "language": lang}
    return to_insert