From dd32941500b21bb3eb747aee13a6bfd1a2523f81 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Dr=C3=A4bing?= Date: Tue, 28 Feb 2017 15:39:19 +0100 Subject: [PATCH 1/6] Added script to pull user data from twitter API --- README.md | 26 ++++++++++++++++++-------- 1 file changed, 18 insertions(+), 8 deletions(-) diff --git a/README.md b/README.md index 3ed342b..9ecb8e9 100644 --- a/README.md +++ b/README.md @@ -23,21 +23,31 @@ Once you have cloned the repo you're ready to rock: 2. Change into the Discursive directory (i.e. `cd discursive/`). -3. Run `essetup.py` which is located in the `/config` directory, which'll generate the Elasticsearch index with the appropriate mappings. +3. If not installed yet on your EC2 instance, install pip. For Ubuntu: `sudo apt-get install python2-pip` -4. Update the `aws_config.py` `twitter_config.py` `esconn.py` and `s3conn.py` files located in the `/config` directory with your credentials. +4. Install requirements by running `pip2 install -r requirements.txt` -5. Put your desired keyword(s) in the `topics.txt` file (one term per line). +5. Update the `aws_config.py` `twitter_config.py` `esconn.py` and `s3conn.py` files located in the `/config` directory with your credentials. -6. Edit the `crontab` file to run at your desired intervals. The default will run every fifteen minutes. +6. Run `essetup.py` which is located in the `/config` directory, which'll generate the Elasticsearch index with the appropriate mappings. -7. Run `sudo docker build -t discursive .` +7. Put your desired keyword(s) in the `topics.txt` file (one term per line). -8. Run `sudo docker run discursive` +8. Edit the `crontab` file to run at your desired intervals. The default will run every fifteen minutes. -9. If all went well you're watching Tweets stream into your Elasticsearch index! Conversely, run `index_twitter_search.py` to search for specific topic(s) and bulk insert the data into your Elasticsearch index (and see the messages from Elasticsearch returned to your console). +9. Run `sudo docker build -t discursive .` -10. There are several options you may want to configure/tweak. For instance, you may want to turn off printing to console (which you can do in `index_twitter_search.py`) or run the container as a detached process. Please do jump into our Slack channel #assemble if you have any questions or log an issue! +10. Run `sudo docker run discursive` + +11. If all went well you're watching Tweets stream into your Elasticsearch index! Conversely, run `index_twitter_search.py` to search for specific topic(s) and bulk insert the data into your Elasticsearch index (and see the messages from Elasticsearch returned to your console). + +12. There are several options you may want to configure/tweak. For instance, you may want to turn off printing to console (which you can do in `index_twitter_search.py`) or run the container as a detached process. Please do jump into our Slack channel #assemble if you have any questions or log an issue! + +## Configuring discursive for AWS + +### aws_config.py + +1. If you not already have done so, (create a IAM)[] user for programmatic access. ## Explore Twitter networks From 5a45afa21801d234d10556558cdf77230d668d6b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Dr=C3=A4bing?= Date: Tue, 28 Feb 2017 15:41:34 +0100 Subject: [PATCH 2/6] Added Twitter User Indexing script --- config/essetup.py | 14 ++++++++ crontab | 1 + index_user_profiles.py | 75 ++++++++++++++++++++++++++++++++++++++++++ users.txt | 2 ++ 4 files changed, 92 insertions(+) create mode 100644 index_user_profiles.py create mode 100644 users.txt diff --git a/config/essetup.py b/config/essetup.py index e1a2069..b6c7eec 100644 --- a/config/essetup.py +++ b/config/essetup.py @@ -35,6 +35,20 @@ 'original_id': {'type': 'string'}, 'original_name': {'type': 'string'} } + }, + 'users': { + 'id': {'type': 'long'}, + 'name': {'type': 'string'}, + 'screen_name': {'type': 'string'}, + 'followers_count': {'type': 'long'}, + 'friends_count': {'type': 'long'}, + 'location': {'type': 'string'}, + 'description': {'type': 'string'}, + 'favorites_count': {'type': 'long'}, + 'statuses_count': {'type': 'long'}, + 'listed_count': {'type': 'long'}, + 'profile_background_image_url': {'type': 'string'}, + 'profile_image_url': {'type': 'string'} } } } diff --git a/crontab b/crontab index 6db029b..92b6495 100644 --- a/crontab +++ b/crontab @@ -1 +1,2 @@ 0,15,30,45 * * * * python /discursive/index_twitter_stream.py /discursive/topics.txt +0 * * * * python /discursive/index_user_profiles.py /discursive/users.txt diff --git a/index_user_profiles.py b/index_user_profiles.py new file mode 100644 index 0000000..9b2ac4c --- /dev/null +++ b/index_user_profiles.py @@ -0,0 +1,75 @@ +import json +import tweepy +from config import esconn, aws_config, twitter_config +import os +from datetime import datetime as dt +from config import s3conn + +# unicode mgmt +import sys +reload(sys) +sys.setdefaultencoding('utf8') + +# Twitter auth and api call setup +auth = tweepy.OAuthHandler(twitter_config.CONSUMER_KEY, twitter_config.CONSUMER_SECRET) +auth.set_access_token(twitter_config.ACCESS_TOKEN, twitter_config.ACCESS_TOKEN_SECRET) +api = tweepy.API(auth) + +# Get elasticsearch connection +es = esconn.esconn() + +if len(sys.argv) > 2: + sys.exit('ERROR: Received 2 or more arguments: {} {} {} Expected 1: User file name'.format(sys.argv[0], sys.argv[1], sys.argv[2])) + +elif len(sys.argv) == 2: + try: + with open(sys.argv[1]) as f: + users = f.readlines() + except Exception: + sys.exit('ERROR: Expected user file %s not found' % sys.argv[1]) +else: + try: + with open('users.txt') as f: + users = f.readlines() + except: + sys.exit('ERROR: Default users.txt not found. No alternate topic file was provided') + + +USERS = [user.replace('\n', '').strip().replace('@', '') for user in users] + +def retrieve_user_data(): + try: + return api.lookup_users(user_ids=USERS) + except tweepy.TweepError as e: + sys.exit("An error occured looking up the user_ids. Verify the correctness and existance of the given screen names, handles or ids.") + +def map_user_for_es(user, time_stamp): + return { + 'timestamp': time_stamp, + 'id': user.id, + 'name': user.screen_name, + 'screen_name': user.screen_name, + 'followers_count': user.followers_count, + 'friends_count': user.friends_count, + 'location': user.location, + 'description': user.description, + 'favorites_count': user.favorites_count, + 'statuses_count': user.statuses_count, + 'listed_count': user.listed_count, + 'profile_background_image_url': user.profile_background_image_url, + 'profile_image_url': user.profile_image_url + } + +def dump_to_elastic(bodydata): + es.index(index='twitter', doc_type="users", body=bodydata) + +def get_time_stamp(): + return dt.now() + +def get_twitter_users_pipeline(): + time_stamp = get_time_stamp() + user_data = retrieve_user_data() + mapped_user_data = map_user_for_es(user_data, time_stamp) + dump_to_elastic(mapped_user_data) + +get_twitter_users_pipeline() \ No newline at end of file diff --git a/users.txt b/users.txt new file mode 100644 index 0000000..9a24dd4 --- /dev/null +++ b/users.txt @@ -0,0 +1,2 @@ +@realDonaldTrump +@Google \ No newline at end of file From e93de93c7426924c289efd58277274c010395dbe Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Dr=C3=A4bing?= Date: Tue, 28 Feb 2017 15:42:03 +0100 Subject: [PATCH 3/6] Revert "Added script to pull user data from twitter API" This reverts commit dd32941500b21bb3eb747aee13a6bfd1a2523f81. --- README.md | 26 ++++++++------------------ 1 file changed, 8 insertions(+), 18 deletions(-) diff --git a/README.md b/README.md index 9ecb8e9..3ed342b 100644 --- a/README.md +++ b/README.md @@ -23,31 +23,21 @@ Once you have cloned the repo you're ready to rock: 2. Change into the Discursive directory (i.e. `cd discursive/`). -3. If not installed yet on your EC2 instance, install pip. For Ubuntu: `sudo apt-get install python2-pip` +3. Run `essetup.py` which is located in the `/config` directory, which'll generate the Elasticsearch index with the appropriate mappings. -4. Install requirements by running `pip2 install -r requirements.txt` +4. Update the `aws_config.py` `twitter_config.py` `esconn.py` and `s3conn.py` files located in the `/config` directory with your credentials. -5. Update the `aws_config.py` `twitter_config.py` `esconn.py` and `s3conn.py` files located in the `/config` directory with your credentials. +5. Put your desired keyword(s) in the `topics.txt` file (one term per line). -6. Run `essetup.py` which is located in the `/config` directory, which'll generate the Elasticsearch index with the appropriate mappings. +6. Edit the `crontab` file to run at your desired intervals. The default will run every fifteen minutes. -7. Put your desired keyword(s) in the `topics.txt` file (one term per line). +7. Run `sudo docker build -t discursive .` -8. Edit the `crontab` file to run at your desired intervals. The default will run every fifteen minutes. +8. Run `sudo docker run discursive` -9. Run `sudo docker build -t discursive .` +9. If all went well you're watching Tweets stream into your Elasticsearch index! Conversely, run `index_twitter_search.py` to search for specific topic(s) and bulk insert the data into your Elasticsearch index (and see the messages from Elasticsearch returned to your console). -10. Run `sudo docker run discursive` - -11. If all went well you're watching Tweets stream into your Elasticsearch index! Conversely, run `index_twitter_search.py` to search for specific topic(s) and bulk insert the data into your Elasticsearch index (and see the messages from Elasticsearch returned to your console). - -12. There are several options you may want to configure/tweak. For instance, you may want to turn off printing to console (which you can do in `index_twitter_search.py`) or run the container as a detached process. Please do jump into our Slack channel #assemble if you have any questions or log an issue! - -## Configuring discursive for AWS - -### aws_config.py - -1. If you not already have done so, (create a IAM)[] user for programmatic access. +10. There are several options you may want to configure/tweak. For instance, you may want to turn off printing to console (which you can do in `index_twitter_search.py`) or run the container as a detached process. Please do jump into our Slack channel #assemble if you have any questions or log an issue! ## Explore Twitter networks From c8a6e3ae7bb3d2e596c88ad3e264e2fe14e1f545 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Dr=C3=A4bing?= Date: Tue, 28 Feb 2017 19:04:20 +0100 Subject: [PATCH 4/6] bugfixes --- index_user_profiles.py | 7 ++++--- users.txt | 4 ++-- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/index_user_profiles.py b/index_user_profiles.py index 9b2ac4c..80314a4 100644 --- a/index_user_profiles.py +++ b/index_user_profiles.py @@ -35,7 +35,7 @@ sys.exit('ERROR: Default users.txt not found. No alternate topic file was provided') -USERS = [user.replace('\n', '').strip().replace('@', '') for user in users] +USERS = [user.replace('\n', '').strip() for user in users] def retrieve_user_data(): try: @@ -69,7 +69,8 @@ def get_time_stamp(): def get_twitter_users_pipeline(): time_stamp = get_time_stamp() user_data = retrieve_user_data() - mapped_user_data = map_user_for_es(user_data, time_stamp) - dump_to_elastic(mapped_user_data) + for user in user_data: + mapped_user_data = map_user_for_es(user_data, time_stamp) + dump_to_elastic(mapped_user_data) get_twitter_users_pipeline() \ No newline at end of file diff --git a/users.txt b/users.txt index 9a24dd4..87dedbc 100644 --- a/users.txt +++ b/users.txt @@ -1,2 +1,2 @@ -@realDonaldTrump -@Google \ No newline at end of file +25073877 +20536157 \ No newline at end of file From cd56b446365665817b8e1bbeec029694f81ce0c6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Dr=C3=A4bing?= Date: Tue, 28 Feb 2017 20:01:22 +0100 Subject: [PATCH 5/6] -Typo fixed -fixed followup bug --- config/essetup.py | 2 +- index_user_profiles.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/config/essetup.py b/config/essetup.py index b6c7eec..f6acfc2 100644 --- a/config/essetup.py +++ b/config/essetup.py @@ -44,7 +44,7 @@ 'friends_count': {'type': 'long'}, 'location': {'type': 'string'}, 'description': {'type': 'string'}, - 'favorites_count': {'type': 'long'}, + 'favourites_count': {'type': 'long'}, 'statuses_count': {'type': 'long'}, 'listed_count': {'type': 'long'}, 'profile_background_image_url': {'type': 'string'}, diff --git a/index_user_profiles.py b/index_user_profiles.py index 80314a4..48df0f5 100644 --- a/index_user_profiles.py +++ b/index_user_profiles.py @@ -53,7 +53,7 @@ def map_user_for_es(user, time_stamp): 'friends_count': user.friends_count, 'location': user.location, 'description': user.description, - 'favorites_count': user.favorites_count, + 'favourites_count': user.favourites_count, 'statuses_count': user.statuses_count, 'listed_count': user.listed_count, 'profile_background_image_url': user.profile_background_image_url, @@ -70,7 +70,7 @@ def get_twitter_users_pipeline(): time_stamp = get_time_stamp() user_data = retrieve_user_data() for user in user_data: - mapped_user_data = map_user_for_es(user_data, time_stamp) + mapped_user_data = map_user_for_es(user, time_stamp) dump_to_elastic(mapped_user_data) get_twitter_users_pipeline() \ No newline at end of file From 8b385d2e147cf5b5afbde947bafc663b79ebd278 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Dr=C3=A4bing?= Date: Tue, 28 Feb 2017 20:06:49 +0100 Subject: [PATCH 6/6] Changed the timing to daily --- crontab | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crontab b/crontab index 92b6495..4dcdf13 100644 --- a/crontab +++ b/crontab @@ -1,2 +1,2 @@ 0,15,30,45 * * * * python /discursive/index_twitter_stream.py /discursive/topics.txt -0 * * * * python /discursive/index_user_profiles.py /discursive/users.txt +0 0 * * * python /discursive/index_user_profiles.py /discursive/users.txt