From dd32941500b21bb3eb747aee13a6bfd1a2523f81 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Thomas=20Dr=C3=A4bing?= <thomas.draebing@gmail.com>
Date: Tue, 28 Feb 2017 15:39:19 +0100
Subject: [PATCH 1/6] Added script to pull user data from twitter API

---
 README.md | 26 ++++++++++++++++++--------
 1 file changed, 18 insertions(+), 8 deletions(-)

diff --git a/README.md b/README.md
index 3ed342b..9ecb8e9 100644
--- a/README.md
+++ b/README.md
@@ -23,21 +23,31 @@ Once you have cloned the repo you're ready to rock:
 
 2. Change into the Discursive directory (i.e. `cd discursive/`).
 
-3. Run `essetup.py` which is located in the `/config` directory, which'll generate the Elasticsearch index with the appropriate mappings.
+3. If not installed yet on your EC2 instance, install pip. For Ubuntu: `sudo apt-get install python2-pip`
 
-4. Update the `aws_config.py` `twitter_config.py` `esconn.py` and `s3conn.py` files located in the `/config` directory with your credentials.
+4. Install requirements by running `pip2 install -r requirements.txt`
 
-5. Put your desired keyword(s) in the `topics.txt` file (one term per line).
+5. Update the `aws_config.py` `twitter_config.py` `esconn.py` and `s3conn.py` files located in the `/config` directory with your credentials.
 
-6. Edit the `crontab` file to run at your desired intervals. The default will run every fifteen minutes. 
+6. Run `essetup.py` which is located in the `/config` directory, which'll generate the Elasticsearch index with the appropriate mappings.
 
-7. Run `sudo docker build -t discursive .`
+7. Put your desired keyword(s) in the `topics.txt` file (one term per line).
 
-8. Run `sudo docker run discursive`
+8. Edit the `crontab` file to run at your desired intervals. The default will run every fifteen minutes. 
 
-9. If all went well you're watching Tweets stream into your Elasticsearch index! Conversely, run `index_twitter_search.py` to search for specific topic(s) and bulk insert the data into your Elasticsearch index (and see the messages from Elasticsearch returned to your console).
+9. Run `sudo docker build -t discursive .`
 
-10. There are several options you may want to configure/tweak. For instance, you may want to turn off printing to console (which you can do in `index_twitter_search.py`) or run the container as a detached process. Please do jump into our Slack channel #assemble if you have any questions or log an issue!
+10. Run `sudo docker run discursive`
+
+11. If all went well you're watching Tweets stream into your Elasticsearch index! Conversely, run `index_twitter_search.py` to search for specific topic(s) and bulk insert the data into your Elasticsearch index (and see the messages from Elasticsearch returned to your console).
+
+12. There are several options you may want to configure/tweak. For instance, you may want to turn off printing to console (which you can do in `index_twitter_search.py`) or run the container as a detached process. Please do jump into our Slack channel #assemble if you have any questions or log an issue!
+
+## Configuring discursive for AWS
+
+### aws_config.py
+
+1. If you not already have done so, (create a IAM)[] user for programmatic access.
 
 ## Explore Twitter networks
 

From 5a45afa21801d234d10556558cdf77230d668d6b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Thomas=20Dr=C3=A4bing?= <thomas.draebing@gmail.com>
Date: Tue, 28 Feb 2017 15:41:34 +0100
Subject: [PATCH 2/6] Added Twitter User Indexing script

---
 config/essetup.py      | 14 ++++++++
 crontab                |  1 +
 index_user_profiles.py | 75 ++++++++++++++++++++++++++++++++++++++++++
 users.txt              |  2 ++
 4 files changed, 92 insertions(+)
 create mode 100644 index_user_profiles.py
 create mode 100644 users.txt

diff --git a/config/essetup.py b/config/essetup.py
index e1a2069..b6c7eec 100644
--- a/config/essetup.py
+++ b/config/essetup.py
@@ -35,6 +35,20 @@
                 'original_id': {'type': 'string'},
                 'original_name': {'type': 'string'}
             }
+        },
+        'users': {  # NOTE(review): fields sit directly under the type, while the preceding type nests its fields one level deeper — a 'properties' wrapper may be missing; confirm ES accepts this
+            'id': {'type': 'long'},
+            'name': {'type': 'string'},
+            'screen_name': {'type': 'string'},
+            'followers_count': {'type': 'long'},
+            'friends_count': {'type': 'long'},
+            'location': {'type': 'string'},
+            'description': {'type': 'string'},
+            'favorites_count': {'type': 'long'},
+            'statuses_count': {'type': 'long'},
+            'listed_count': {'type': 'long'},
+            'profile_background_image_url': {'type': 'string'},
+            'profile_image_url': {'type': 'string'}  # NOTE(review): index_user_profiles.py also sends a 'timestamp' field that is not declared here — presumably left to dynamic mapping; confirm
         }
     }
 }
diff --git a/crontab b/crontab
index 6db029b..92b6495 100644
--- a/crontab
+++ b/crontab
@@ -1 +1,2 @@
 0,15,30,45 * * * * python /discursive/index_twitter_stream.py /discursive/topics.txt
+0 * * * * python /discursive/index_user_profiles.py /discursive/users.txt
diff --git a/index_user_profiles.py b/index_user_profiles.py
new file mode 100644
index 0000000..9b2ac4c
--- /dev/null
+++ b/index_user_profiles.py
@@ -0,0 +1,75 @@
+import json
+import tweepy
+from config import esconn, aws_config, twitter_config
+import os
+from datetime import datetime as dt
+from config import s3conn
+
+# unicode mgmt: make UTF-8 the process-wide default encoding
+import sys
+reload(sys)  # builtin reload() only exists on Python 2, so this script is Python 2-only
+sys.setdefaultencoding('utf8')
+
+# Twitter auth and api call setup (credentials come from config.twitter_config)
+auth = tweepy.OAuthHandler(twitter_config.CONSUMER_KEY, twitter_config.CONSUMER_SECRET)
+auth.set_access_token(twitter_config.ACCESS_TOKEN, twitter_config.ACCESS_TOKEN_SECRET)
+api = tweepy.API(auth)  # module-level client, used by retrieve_user_data()
+
+# Get elasticsearch connection (module-level, used by dump_to_elastic())
+es = esconn.esconn()
+
+if len(sys.argv) > 2:  # more than one CLI argument: refuse to guess which one is the user file
+    sys.exit('ERROR: Received 2 or more arguments: {} {} {} Expected 1: User file name'.format(sys.argv[0], sys.argv[1], sys.argv[2]))
+
+elif len(sys.argv) == 2:  # explicit user file given on the command line
+    try:
+        with open(sys.argv[1]) as f:
+            users = f.readlines()
+    except IOError:  # only file-access failures mean "not found"; other errors should surface
+        sys.exit('ERROR: Expected user file %s not found' % sys.argv[1])
+else:  # no argument: fall back to users.txt in the current working directory
+    try:
+        with open('users.txt') as f:
+            users = f.readlines()
+    except IOError:  # not a bare except: Ctrl-C/SystemExit must not be reported as a missing file
+        sys.exit('ERROR: Default users.txt not found. No alternate topic file  was provided')
+
+
+USERS = [user.replace('\n', '').strip().replace('@', '') for user in users]
+
+def retrieve_user_data():
+    try:
+        return api.lookup_users(user_ids=USERS)  # entries are sent as user ids, not screen names
+    except tweepy.TweepError as e:
+        sys.exit("An error occurred looking up the user_ids: {}. Verify the correctness and existence of the given user ids.".format(e))
+
+def map_user_for_es(user, time_stamp):  # build the ES 'users' document for one user from its attributes
+    return {
+        'timestamp': time_stamp,
+        'id': user.id,
+        'name': user.name,  # display name; the handle is stored separately in 'screen_name'
+        'screen_name': user.screen_name,
+        'followers_count': user.followers_count,
+        'friends_count': user.friends_count,
+        'location': user.location,
+        'description': user.description,
+        'favorites_count': user.favorites_count,
+        'statuses_count': user.statuses_count,
+        'listed_count': user.listed_count,
+        'profile_background_image_url': user.profile_background_image_url,
+        'profile_image_url': user.profile_image_url
+    }
+
+def dump_to_elastic(bodydata):  # index bodydata as one document in index 'twitter', doc type "users"
+    es.index(index='twitter', doc_type="users", body=bodydata)  # no id is passed — presumably ES assigns one, so every run adds new documents; confirm
+
+def get_time_stamp():  # called once per pipeline run; the value is passed on to map_user_for_es
+    return dt.now()  # naive datetime in the host's local time (no tzinfo)
+
+def get_twitter_users_pipeline():
+    time_stamp = get_time_stamp()
+    user_data = retrieve_user_data()
+    mapped_user_data = map_user_for_es(user_data, time_stamp)
+    dump_to_elastic(mapped_user_data)
+
+get_twitter_users_pipeline()
\ No newline at end of file
diff --git a/users.txt b/users.txt
new file mode 100644
index 0000000..9a24dd4
--- /dev/null
+++ b/users.txt
@@ -0,0 +1,2 @@
+@realDonaldTrump
+@Google
\ No newline at end of file

From e93de93c7426924c289efd58277274c010395dbe Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Thomas=20Dr=C3=A4bing?= <thomas.draebing@gmail.com>
Date: Tue, 28 Feb 2017 15:42:03 +0100
Subject: [PATCH 3/6] Revert "Added script to pull user data from twitter API"

This reverts commit dd32941500b21bb3eb747aee13a6bfd1a2523f81.
---
 README.md | 26 ++++++++------------------
 1 file changed, 8 insertions(+), 18 deletions(-)

diff --git a/README.md b/README.md
index 9ecb8e9..3ed342b 100644
--- a/README.md
+++ b/README.md
@@ -23,31 +23,21 @@ Once you have cloned the repo you're ready to rock:
 
 2. Change into the Discursive directory (i.e. `cd discursive/`).
 
-3. If not installed yet on your EC2 instance, install pip. For Ubuntu: `sudo apt-get install python2-pip`
+3. Run `essetup.py` which is located in the `/config` directory, which'll generate the Elasticsearch index with the appropriate mappings.
 
-4. Install requirements by running `pip2 install -r requirements.txt`
+4. Update the `aws_config.py` `twitter_config.py` `esconn.py` and `s3conn.py` files located in the `/config` directory with your credentials.
 
-5. Update the `aws_config.py` `twitter_config.py` `esconn.py` and `s3conn.py` files located in the `/config` directory with your credentials.
+5. Put your desired keyword(s) in the `topics.txt` file (one term per line).
 
-6. Run `essetup.py` which is located in the `/config` directory, which'll generate the Elasticsearch index with the appropriate mappings.
+6. Edit the `crontab` file to run at your desired intervals. The default will run every fifteen minutes. 
 
-7. Put your desired keyword(s) in the `topics.txt` file (one term per line).
+7. Run `sudo docker build -t discursive .`
 
-8. Edit the `crontab` file to run at your desired intervals. The default will run every fifteen minutes. 
+8. Run `sudo docker run discursive`
 
-9. Run `sudo docker build -t discursive .`
+9. If all went well you're watching Tweets stream into your Elasticsearch index! Conversely, run `index_twitter_search.py` to search for specific topic(s) and bulk insert the data into your Elasticsearch index (and see the messages from Elasticsearch returned to your console).
 
-10. Run `sudo docker run discursive`
-
-11. If all went well you're watching Tweets stream into your Elasticsearch index! Conversely, run `index_twitter_search.py` to search for specific topic(s) and bulk insert the data into your Elasticsearch index (and see the messages from Elasticsearch returned to your console).
-
-12. There are several options you may want to configure/tweak. For instance, you may want to turn off printing to console (which you can do in `index_twitter_search.py`) or run the container as a detached process. Please do jump into our Slack channel #assemble if you have any questions or log an issue!
-
-## Configuring discursive for AWS
-
-### aws_config.py
-
-1. If you not already have done so, (create a IAM)[] user for programmatic access.
+10. There are several options you may want to configure/tweak. For instance, you may want to turn off printing to console (which you can do in `index_twitter_search.py`) or run the container as a detached process. Please do jump into our Slack channel #assemble if you have any questions or log an issue!
 
 ## Explore Twitter networks
 

From c8a6e3ae7bb3d2e596c88ad3e264e2fe14e1f545 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Thomas=20Dr=C3=A4bing?= <thomas.draebing@gmail.com>
Date: Tue, 28 Feb 2017 19:04:20 +0100
Subject: [PATCH 4/6] bugfixes

---
 index_user_profiles.py | 7 ++++---
 users.txt              | 4 ++--
 2 files changed, 6 insertions(+), 5 deletions(-)

diff --git a/index_user_profiles.py b/index_user_profiles.py
index 9b2ac4c..80314a4 100644
--- a/index_user_profiles.py
+++ b/index_user_profiles.py
@@ -35,7 +35,7 @@
         sys.exit('ERROR: Default users.txt not found. No alternate topic file  was provided')
 
 
-USERS = [user.replace('\n', '').strip().replace('@', '') for user in users]
+USERS = [user.strip() for user in users if user.strip()]  # skip blank lines so no empty id reaches lookup_users
 
 def retrieve_user_data():
     try:
@@ -69,7 +69,8 @@ def get_time_stamp():
 def get_twitter_users_pipeline():
     time_stamp = get_time_stamp()
     user_data = retrieve_user_data()
-    mapped_user_data = map_user_for_es(user_data, time_stamp)
-    dump_to_elastic(mapped_user_data)
+    for user in user_data:
+        mapped_user_data = map_user_for_es(user_data, time_stamp)
+        dump_to_elastic(mapped_user_data)
 
 get_twitter_users_pipeline()
\ No newline at end of file
diff --git a/users.txt b/users.txt
index 9a24dd4..87dedbc 100644
--- a/users.txt
+++ b/users.txt
@@ -1,2 +1,2 @@
-@realDonaldTrump
-@Google
\ No newline at end of file
+25073877
+20536157
\ No newline at end of file

From cd56b446365665817b8e1bbeec029694f81ce0c6 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Thomas=20Dr=C3=A4bing?= <thomas.draebing@gmail.com>
Date: Tue, 28 Feb 2017 20:01:22 +0100
Subject: [PATCH 5/6] -Typo fixed -fixed followup bug

---
 config/essetup.py      | 2 +-
 index_user_profiles.py | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/config/essetup.py b/config/essetup.py
index b6c7eec..f6acfc2 100644
--- a/config/essetup.py
+++ b/config/essetup.py
@@ -44,7 +44,7 @@
             'friends_count': {'type': 'long'},
             'location': {'type': 'string'},
             'description': {'type': 'string'},
-            'favorites_count': {'type': 'long'},
+            'favourites_count': {'type': 'long'},
             'statuses_count': {'type': 'long'},
             'listed_count': {'type': 'long'},
             'profile_background_image_url': {'type': 'string'},
diff --git a/index_user_profiles.py b/index_user_profiles.py
index 80314a4..48df0f5 100644
--- a/index_user_profiles.py
+++ b/index_user_profiles.py
@@ -53,7 +53,7 @@ def map_user_for_es(user, time_stamp):
         'friends_count': user.friends_count,
         'location': user.location,
         'description': user.description,
-        'favorites_count': user.favorites_count,
+        'favourites_count': user.favourites_count,
         'statuses_count': user.statuses_count,
         'listed_count': user.listed_count,
         'profile_background_image_url': user.profile_background_image_url,
@@ -70,7 +70,7 @@ def get_twitter_users_pipeline():
     time_stamp = get_time_stamp()
     user_data = retrieve_user_data()
     for user in user_data:
-        mapped_user_data = map_user_for_es(user_data, time_stamp)
+        mapped_user_data = map_user_for_es(user, time_stamp)
         dump_to_elastic(mapped_user_data)
 
 get_twitter_users_pipeline()
\ No newline at end of file

From 8b385d2e147cf5b5afbde947bafc663b79ebd278 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Thomas=20Dr=C3=A4bing?= <thomas.draebing@gmail.com>
Date: Tue, 28 Feb 2017 20:06:49 +0100
Subject: [PATCH 6/6] Changed the timing to daily

---
 crontab | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/crontab b/crontab
index 92b6495..4dcdf13 100644
--- a/crontab
+++ b/crontab
@@ -1,2 +1,2 @@
 0,15,30,45 * * * * python /discursive/index_twitter_stream.py /discursive/topics.txt
-0 * * * * python /discursive/index_user_profiles.py /discursive/users.txt
+0 0 * * * python /discursive/index_user_profiles.py /discursive/users.txt