##
## File: assignment10-solutions.py (STAT 3250)
## Topic: Assignment 10 Solutions
##
## For this assignment you will be working with Twitter data related
## to the season opening of Game of Thrones on April 14, 2019. You will use
## a set of over 10,000 tweets for this purpose. The data is in the file
## 'GoTtweets.txt'. The code below can be used to import the data into
## a list, with each list element a dict of the tweet object.
## Note: On this assignment it makes sense to use loops to extract
## information from the tweets. Go wild.
## The Gradescope autograder will be evaluating your code on a reduced
## version of the GoTtweets.txt data that includes only a fraction of the
## records. Your code needs to automatically handle all assignments
## to the variables q1, q2, ... to accommodate the reduced data set,
## so do not copy/paste things from the console window, and take care
## with hard-coding values.
import numpy as np # load numpy as np
import pandas as pd # load pandas as pd
import json
import re
# Read 'GoTtweets.txt' line by line and parse each line as JSON, collecting
# the parsed tweet objects in `tweetlist`.
tweetlist = []
# The context manager closes the file even if a line fails to parse, and the
# explicit encoding keeps the read independent of the platform default.
with open('GoTtweets.txt', 'r', encoding='utf-8') as tweetfile:
    for line in tweetfile:
        if line.strip():  # a blank line is not valid JSON, so skip it
            tweetlist.append(json.loads(line))
## 1. The tweets were downloaded in several groups at about the same time.
## Are there any that appear in the file more than once? Give a Series
## with the tweet ID for any repeated tweets as the index and the number
## of times each ID appears in the file as values. Sort by the index from
## smallest to largest.
# Placeholder for the question 1 answer. Expected result: a Series whose index
# is the repeated tweet IDs (sorted smallest to largest) and whose values are
# the number of times each ID appears in the file.
q1 = None # Series of tweet IDs that appear > 1 time
## Note: For the remaining questions in this assignment, do not worry about
## any duplicate tweets. Just answer the questions based on the
## existing data set.
## 2. Determine the number of tweets that include 'Daenerys' (any combination
## of upper and lower case) in the text of the tweet. Then do the same
## for 'Snow'.
# Placeholders for the question 2 answers. Expected results: counts of tweets
# whose text contains the given name in any combination of upper and lower case.
q2a = None # number of tweets including 'daenerys'
q2b = None # number of tweets including 'snow'
## 3. Find the average number of hashtags included in the tweets. (You may get
## the wrong answer if you use the text of the tweets instead of the
## hashtag lists.)
# Placeholder for the question 3 answer. Expected result: the mean number of
# hashtags per tweet, taken from the hashtag lists rather than the tweet text.
q3 = None # average number of hashtags per tweet
## 4. Determine the tweets that have 0 hashtags, 1 hashtag, 2 hashtags,
## and so on. Give your answer as a Series with the number of hashtags
## as index (sorted smallest to largest) and the corresponding number of
## tweets as values. Include in your Series index only number of hashtags
## that occur for at least one tweet. (Note: See warning in #3)
# Placeholder for the question 4 answer. Expected result: a Series whose index
# is the number of hashtags (sorted smallest to largest, only values that occur
# for at least one tweet) and whose values are the matching tweet counts.
q4 = None # Series of number of hashtags and counts
## 5. Determine the number of tweets that include the hashtag '#GoT', then
## repeat for '#GameofThrones'. (You may get the wrong answer if you
## use the text of the tweets instead of the hashtag lists.)
## Note: Hashtags are not case sensitive, so any of '#GOT', '#got', '#GOt'
## etc are all considered matches.
# Placeholders for the question 5 answers. Each is meant to hold a count of
# tweets whose hashtag list contains the given hashtag, ignoring case.
q5a = None # number of tweets with '#GoT' hashtag and upper/lower variants
q5b = None # number of tweets with '#GameofThrones' hashtags and upper/lower variants
## 6. Some tweeters like to tweet a lot. Find the screen name for all
## tweeters with at least 3 tweets in this data. Give a Series with
## the screen name (in lower case) as index and the number of tweets as
## value, sorting by the index in alphabetical order.
# Placeholder for the question 6 answer. Expected result: a Series whose index
# is the lower-case screen names with at least 3 tweets (sorted alphabetically)
# and whose values are the number of tweets for each.
q6 = None # Series of screen name and counts
## 7. Among the screen names with 3 or more tweets, find the average
## 'followers_count' for each and then give a table with the screen name
## and average number of followers. (Note that the number of
## followers might change from tweet to tweet.) Give a Series with
## screen name (in lower case) as index and the average number of followers
## as value, sorting by the index in alphabetical order.
# Placeholder for the question 7 answer. Expected result: a Series whose index
# is the lower-case screen names with 3 or more tweets (sorted alphabetically)
# and whose values are the average 'followers_count' for each.
q7 = None # Series of screen names and mean follower counts
## 8. Determine the hashtags that appeared in at least 50 tweets. Give
## a Series with the hashtags (lower case) as index and the corresponding
## number of tweets as values, sorted alphabetically by hashtag.
# Placeholder for the question 8 answer. Expected result: a Series whose index
# is the lower-case hashtags appearing in at least 50 tweets (sorted
# alphabetically) and whose values are the number of tweets for each.
q8 = None # Series of hashtags and counts
## 9. Some of the tweets include the location of the tweeter. Give a Series
## of the names of countries with at least three tweets, with country
## name as index and corresponding tweet count as values. Sort the
## Series alphabetically by country name.
# Placeholder for the question 9 answer. Expected result: a Series whose index
# is the country names with at least three tweets (sorted alphabetically) and
# whose values are the tweet counts.
q9 = None # Series of countries with at least three tweets
## Questions 10-11: The remaining questions should be done using regular
## expressions as described in the class lectures.
## 10. Determine the percentage of tweets (if any) with a sequence of 3 or more
## consecutive digits. (No spaces between the digits!) For such tweets,
## apply 'split()' to create a list of substrings. Among all the
## substrings with a sequence of at least three consecutive digits,
## determine the percentage where the substring starts with a '@' at the
## beginning of the substring.
# Placeholders for the question 10 answers. q10a: percentage of tweets that
# contain 3 or more consecutive digits. q10b: among the split() substrings of
# those tweets that contain at least three consecutive digits, the percentage
# that start with '@'.
q10a = None # percentage of tweets with three consecutive digits
q10b = None # percentage starting with @ among substrings with 3 consec digits
## 11. Determine if there are any cases of a tweet with a 'hashtag' that is
## actually not a hashtag because there is a character (letter or digit)
## immediately before the "#". An example would be 'nota#hashtag'.
## Count the number of tweets with such an incorrect 'hashtag'.
# Placeholder for the question 11 answer. Expected result: the number of tweets
# in which a "#" is immediately preceded by a letter or digit.
q11 = None # count of tweets with bad hashtag
{"created_at":"Mon Apr 15 01:59: XXXXXXXXXX","id": XXXXXXXXXX,"id_str":" XXXXXXXXXX","text":"y\u2019all are sick\ud83d\ude2d\ud83d\ude2d\ud83d\ude2d\ud83d\ude2d","source":"\u003ca href=\"http:\/\/twitter.com\/download\/iphone\" rel=\"nofollow\"\u003eTwitter for iPhone\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id": XXXXXXXXXX,"id_str":" XXXXXXXXXX","name":"riann.","screen_name":"riannchanel","location":null,"url":null,"description":"University of Michigan \u201823|| black hermione granger \u26a1\ufe0f","translator_type":"none","protected":false,"verified":false,"followers_count":376,"friends_count":508,"listed_count":18,"favourites_count":77144,"statuses_count":29543,"created_at":"Mon Jul 06 23:56: XXXXXXXXXX","utc_offset":null,"time_zone":null,"geo_enabled":false,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"C0DEED","profile_background_image_url":"http:\/\/abs.twimg.com\/images\/themes\/theme1\
g.png","profile_background_image_url_https":"https:\/\/abs.twimg.com\/images\/themes\/theme1\
g.png","profile_background_tile":false,"profile_link_color":"1DA1F2","profile_sidebar_border_color":"C0DEED","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"profile_image_url":"http:\/\/pbs.twimg.com\/profile_images\/ XXXXXXXXXX\/WvbnYU1D_normal.jpg","profile_image_url_https":"https:\/\/pbs.twimg.com\/profile_images\/ XXXXXXXXXX\/WvbnYU1D_normal.jpg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/ XXXXXXXXXX\/ XXXXXXXXXX","default_profile":true,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"quoted_status_id": XXXXXXXXXX,"quoted_status_id_str":" XXXXXXXXXX","quoted_status":{"created_at":"Mon Apr 15 01:56: XXXXXXXXXX","id": XXXXXXXXXX,"id_str":" XXXXXXXXXX","text":"Sam: cant wait to see my family\n\nDaenerys: \n\n#GameofThrones https:\/\/t.co\/kK8IQsu1ul","display_text_range":[0,59],"source":"\u003ca href=\"http:\/\/twitter.com\/download\/iphone\" rel=\"nofollow\"\u003eTwitter for iPhone\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id": XXXXXXXXXX,"id_str":" XXXXXXXXXX","name":"joonie\u2019s","screen_name":"nayooniee","location":"moon","url":"https:\/\/archiveofourown.org\/users\/nayooniee","description":"this is not a soft account.","translator_type":"none","protected":false,"verified":false,"followers_count":592,"friends_count":587,"listed_count":2,"favourites_count":11469,"statuses_count":5630,"created_at":"Sun Jan 14 14:29: 
XXXXXXXXXX","utc_offset":null,"time_zone":null,"geo_enabled":true,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"F5F8FA","profile_background_image_url":"","profile_background_image_url_https":"","profile_background_tile":false,"profile_link_color":"1DA1F2","profile_sidebar_border_color":"C0DEED","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"profile_image_url":"http:\/\/pbs.twimg.com\/profile_images\/ XXXXXXXXXX\/LB5SQhi1_normal.jpg","profile_image_url_https":"https:\/\/pbs.twimg.com\/profile_images\/ XXXXXXXXXX\/LB5SQhi1_normal.jpg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/ XXXXXXXXXX\/ XXXXXXXXXX","default_profile":true,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"is_quote_status":false,"quote_count":3,"reply_count":0,"retweet_count":142,"favorite_count":389,"entities":{"hashtags":[{"text":"GameofThrones","indices":[45,59]}],"urls":[],"user_mentions":[],"symbols":[],"media":[{"id": XXXXXXXXXX,"id_str":" XXXXXXXXXX","indices":[60,83],"media_url":"http:\/\/pbs.twimg.com\/media\/D4KKHuMXoAMBuwb.jpg","media_url_https":"https:\/\/pbs.twimg.com\/media\/D4KKHuMXoAMBuwb.jpg","url":"https:\/\/t.co\/kK8IQsu1ul","display_url":"pic.twitter.com\/kK8IQsu1ul","expanded_url":"https:\/\/twitter.com\/nayooniee\/status\/ XXXXXXXXXX\/photo\/1","type":"photo","sizes":{"thumb":{"w":150,"h":150,"resize":"crop"},"medium":{"w":530,"h":578,"resize":"fit"},"large":{"w":530,"h":578,"resize":"fit"},"small":{"w":530,"h":578,"resize":"fit"}}}]},"extended_entities":{"media":[{"id": XXXXXXXXXX,"id_str":" 
XXXXXXXXXX","indices":[60,83],"media_url":"http:\/\/pbs.twimg.com\/media\/D4KKHuMXoAMBuwb.jpg","media_url_https":"https:\/\/pbs.twimg.com\/media\/D4KKHuMXoAMBuwb.jpg","url":"https:\/\/t.co\/kK8IQsu1ul","display_url":"pic.twitter.com\/kK8IQsu1ul","expanded_url":"https:\/\/twitter.com\/nayooniee\/status\/ XXXXXXXXXX\/photo\/1","type":"photo","sizes":{"thumb":{"w":150,"h":150,"resize":"crop"},"medium":{"w":530,"h":578,"resize":"fit"},"large":{"w":530,"h":578,"resize":"fit"},"small":{"w":530,"h":578,"resize":"fit"}}}]},"favorited":false,"retweeted":false,"possibly_sensitive":false,"filter_level":"low","lang":"en"},"quoted_status_permalink":{"url":"https:\/\/t.co\/xpPSFZxUij","expanded":"https:\/\/twitter.com\/nayooniee\/status\/ XXXXXXXXXX","display":"twitter.com\/nayooniee\/stat\u2026"},"is_quote_status":true,"quote_count":0,"reply_count":0,"retweet_count":0,"favorite_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[],"symbols":[]},"favorited":false,"retweeted":false,"filter_level":"low","lang":"en","timestamp_ms":" XXXXXXXXXX"}
{"created_at":"Mon Apr 15 01:59: XXXXXXXXXX","id": XXXXXXXXXX,"id_str":" XXXXXXXXXX","text":"Me hice pip\u00ed #got","source":"\u003ca href=\"http:\/\/twitter.com\/download\/android\" rel=\"nofollow\"\u003eTwitter for Android\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id": XXXXXXXXXX,"id_str":" XXXXXXXXXX","name":"Jos\u00e9 Cristaldo","screen_name":"chuy94cristaldo","location":null,"url":null,"description":"Hincha #1 del Club Ce
o Porte\u00f1o","translator_type":"none","protected":false,"verified":false,"followers_count":61,"friends_count":92,"listed_count":0,"favourites_count":1170,"statuses_count":367,"created_at":"Tue Jul 09 11:26: XXXXXXXXXX","utc_offset":null,"time_zone":null,"geo_enabled":false,"lang":"es","contributors_enabled":false,"is_translator":false,"profile_background_color":"C0DEED","profile_background_image_url":"http:\/\/abs.twimg.com\/images\/themes\/theme1\
g.png","profile_background_image_url_https":"https:\/\/abs.twimg.com\/images\/themes\/theme1\
g.png","profile_background_tile":false,"profile_link_color":"1DA1F2","profile_sidebar_border_color":"C0DEED","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"profile_image_url":"http:\/\/pbs.twimg.com\/profile_images\/ XXXXXXXXXX\/qYELCfOE_normal.jpg","profile_image_url_https":"https:\/\/pbs.twimg.com\/profile_images\/ XXXXXXXXXX\/qYELCfOE_normal.jpg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/ XXXXXXXXXX\/ XXXXXXXXXX","default_profile":true,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"is_quote_status":false,"quote_count":0,"reply_count":0,"retweet_count":0,"favorite_count":0,"entities":{"hashtags":[{"text":"got","indices":[13,17]}],"urls":[],"user_mentions":[],"symbols":[]},"favorited":false,"retweeted":false,"filter_level":"low","lang":"es","timestamp_ms":" XXXXXXXXXX"}
{"created_at":"Mon Apr 15 01:59: XXXXXXXXXX","id": XXXXXXXXXX,"id_str":" XXXXXXXXXX","text":"#GameofThrones jajaj el encuentro de Bran y Jaime","source":"\u003ca href=\"http:\/\/twitter.com\/download\/android\" rel=\"nofollow\"\u003eTwitter for Android\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id": XXXXXXXXXX,"id_str":" XXXXXXXXXX","name":"mmmmmm","screen_name":"erikjanina","location":null,"url":null,"description":"\ud83c\udf41\ud83c\udf42No eres li
e cuando haces lo que quieres, eres li
e cuando expresas lo q eres \ud83d\udc9a.......La m\u00fasica es m\u00ed terapia \ud83c\udfb5\ud83c\udfa7","translator_type":"none","protected":false,"verified":false,"followers_count":163,"friends_count":425,"listed_count":3,"favourites_count":6392,"statuses_count":19642,"created_at":"Mon May 27 03:22: XXXXXXXXXX","utc_offset":null,"time_zone":null,"geo_enabled":false,"lang":"es","contributors_enabled":false,"is_translator":false,"profile_background_color":"24D2DE","profile_background_image_url":"http:\/\/abs.twimg.com\/images\/themes\/theme4\
g.gif","profile_background_image_url_https":"https:\/\/abs.twimg.com\/images\/themes\/theme4\
g.gif","profile_background_tile":true,"profile_link_color":"009AB9","profile_sidebar_border_color":"FFFFFF","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"profile_image_url":"http:\/\/pbs.twimg.com\/profile_images\/ XXXXXXXXXX\/Ype2pF_h_normal.jpg","profile_image_url_https":"https:\/\/pbs.twimg.com\/profile_images\/ XXXXXXXXXX\/Ype2pF_h_normal.jpg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/ XXXXXXXXXX\/ XXXXXXXXXX","default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"is_quote_status":false,"quote_count":0,"reply_count":0,"retweet_count":0,"favorite_count":0,"entities":{"hashtags":[{"text":"GameofThrones","indices":[0,14]}],"urls":[],"user_mentions":[],"symbols":[]},"favorited":false,"retweeted":false,"filter_level":"low","lang":"es","timestamp_ms":" XXXXXXXXXX"}
{"created_at":"Mon Apr 15 01:59: XXXXXXXXXX"