Data Wrangling

Table of Contents

In [1]:
# import the needed libraries
import pandas as pd
import numpy as np
import requests
import zipfile
import matplotlib.pyplot as plt
from pandas.plotting import register_matplotlib_converters
register_matplotlib_converters()
import seaborn as sns
plt.style.use('bmh')
from PIL import Image
from io import BytesIO
from wordcloud import WordCloud, STOPWORDS
import requests
import numpy as np
%matplotlib inline

Gather

In [2]:
# Load the on-hand twitter-archive-enhanced CSV file into a DataFrame
archive_csv = 'twitter-archive-enhanced.csv'
twitter_archive = pd.read_csv(archive_csv)
In [3]:
# Preview the first 5 rows to inspect the data
twitter_archive.head(n=5)
Out[3]:
tweet_id in_reply_to_status_id in_reply_to_user_id timestamp source text retweeted_status_id retweeted_status_user_id retweeted_status_timestamp expanded_urls rating_numerator rating_denominator name doggo floofer pupper puppo
0 892420643555336193 NaN NaN 2017-08-01 16:23:56 +0000 <a href="http://twitter.com/download/iphone" r... This is Phineas. He's a mystical boy. Only eve... NaN NaN NaN https://twitter.com/dog_rates/status/892420643... 13 10 Phineas None None None None
1 892177421306343426 NaN NaN 2017-08-01 00:17:27 +0000 <a href="http://twitter.com/download/iphone" r... This is Tilly. She's just checking pup on you.... NaN NaN NaN https://twitter.com/dog_rates/status/892177421... 13 10 Tilly None None None None
2 891815181378084864 NaN NaN 2017-07-31 00:18:03 +0000 <a href="http://twitter.com/download/iphone" r... This is Archie. He is a rare Norwegian Pouncin... NaN NaN NaN https://twitter.com/dog_rates/status/891815181... 12 10 Archie None None None None
3 891689557279858688 NaN NaN 2017-07-30 15:58:51 +0000 <a href="http://twitter.com/download/iphone" r... This is Darla. She commenced a snooze mid meal... NaN NaN NaN https://twitter.com/dog_rates/status/891689557... 13 10 Darla None None None None
4 891327558926688256 NaN NaN 2017-07-29 16:00:24 +0000 <a href="http://twitter.com/download/iphone" r... This is Franklin. He would like you to stop ca... NaN NaN NaN https://twitter.com/dog_rates/status/891327558... 12 10 Franklin None None None None
In [4]:
# Download the image-prediction file programmatically using the given url.
# A timeout (in seconds) keeps the cell from hanging forever if the server
# never answers.
url = 'https://d17h27t6h515a5.cloudfront.net/topher/2017/August/599fd2ad_image-predictions/image-predictions.tsv'
response = requests.get(url, timeout=30)

# Fail loudly on an HTTP error (4xx/5xx). Without this check the body of the
# error response would be written to disk and later parsed as if it were data.
response.raise_for_status()

# save to .tsv file
with open('image_predictions.tsv', 'wb') as file:
    file.write(response.content)
In [5]:
# Load the downloaded tab-separated prediction file into a pandas DataFrame
predictions_file = 'image_predictions.tsv'
image_pred = pd.read_csv(predictions_file, delimiter='\t')

# Preview the first 5 rows
image_pred.head(n=5)
Out[5]:
tweet_id jpg_url img_num p1 p1_conf p1_dog p2 p2_conf p2_dog p3 p3_conf p3_dog
0 666020888022790149 https://pbs.twimg.com/media/CT4udn0WwAA0aMy.jpg 1 Welsh_springer_spaniel 0.465074 True collie 0.156665 True Shetland_sheepdog 0.061428 True
1 666029285002620928 https://pbs.twimg.com/media/CT42GRgUYAA5iDo.jpg 1 redbone 0.506826 True miniature_pinscher 0.074192 True Rhodesian_ridgeback 0.072010 True
2 666033412701032449 https://pbs.twimg.com/media/CT4521TWwAEvMyu.jpg 1 German_shepherd 0.596461 True malinois 0.138584 True bloodhound 0.116197 True
3 666044226329800704 https://pbs.twimg.com/media/CT5Dr8HUEAA-lEu.jpg 1 Rhodesian_ridgeback 0.408143 True redbone 0.360687 True miniature_pinscher 0.222752 True
4 666049248165822465 https://pbs.twimg.com/media/CT5IQmsXIAAKY4A.jpg 1 miniature_pinscher 0.560311 True Rottweiler 0.243682 True Doberman 0.154629 True

I tried to set up a Twitter developer account, but my application was not approved.

  • The following code is the Twitter API code provided by Udacity.
  • I have commented it out so that the notebook remains reproducible when all of its code cells are rerun.
In [6]:
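# NOTE(review): if this cell is ever uncommented, it uses names that the imports
# cell at the top does not provide (tweepy, OAuthHandler, json and a `timer`
# function) -- import them first.
# Query Twitter API for each tweet in the Twitter archive and save JSON in a text file
# These are hidden to comply with Twitter's API terms and conditions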
#consumer_key = 'HIDDEN'
#consumer_secret = 'HIDDEN'
#access_token = 'HIDDEN'
#access_secret = 'HIDDEN'

#auth = OAuthHandler(consumer_key, consumer_secret)
#auth.set_access_token(access_token, access_secret)

#api = tweepy.API(auth, wait_on_rate_limit=True)


# NOTE TO REVIEWER: this student had mobile verification issues so the following
# Twitter API code was sent to this student from a Udacity instructor

# Tweet IDs for which to gather additional data via Twitter's API
#tweet_ids = twitter_archive.tweet_id.values
#len(tweet_ids)

# Query Twitter's API for JSON data for each tweet ID in the Twitter archive
#count = 0
#fails_dict = {}
#start = timer()
# Save each tweet's returned JSON as a new line in a .txt file
#with open('tweet_json.txt', 'w') as outfile:
    # This loop will likely take 20-30 minutes to run because of Twitter's rate limit
#    for tweet_id in tweet_ids:
#        count += 1
#        print(str(count) + ": " + str(tweet_id))
#        try:
#            tweet = api.get_status(tweet_id, tweet_mode='extended')
#            print("Success")
#            json.dump(tweet._json, outfile)
#            outfile.write('\n')
#        except tweepy.TweepError as e:
#            print("Fail")
#            fails_dict[tweet_id] = e
#            pass
#end = timer()
#print(end - start)
#print(fails_dict)

The data that the previous code would have gathered is provided by Udacity in the project resources as a zip file.

In [7]:
# Extract every member of the zip archive into the current directory.
# ZipFile is used as a context manager so the archive is closed when the
# block ends, and the member names are collected while it is still open.
with zipfile.ZipFile('tweet-json.zip') as z_tweets:
    z_tweets.extractall()
    extracted_names = z_tweets.namelist()

# check for the extracted file
extracted_names
Out[7]:
['tweet-json copy']
In [8]:
# Read the extracted file (one JSON object per line) into a DataFrame.
# The encoding is given to open(), because that is where the bytes are decoded;
# without it open() falls back to the platform's default encoding, which is
# not UTF-8 on every system.
with open('tweet-json copy', 'r', encoding='utf-8') as f:
    tweet_json = pd.read_json(f, lines=True)

# check the data
tweet_json.head(3)
Out[8]:
contributors coordinates created_at display_text_range entities extended_entities favorite_count favorited full_text geo ... possibly_sensitive_appealable quoted_status quoted_status_id quoted_status_id_str retweet_count retweeted retweeted_status source truncated user
0 NaN NaN 2017-08-01 16:23:56 [0, 85] {'hashtags': [], 'symbols': [], 'user_mentions... {'media': [{'id': 892420639486877696, 'id_str'... 39467 False This is Phineas. He's a mystical boy. Only eve... NaN ... 0.0 NaN NaN NaN 8853 False NaN <a href="http://twitter.com/download/iphone" r... False {'id': 4196983835, 'id_str': '4196983835', 'na...
1 NaN NaN 2017-08-01 00:17:27 [0, 138] {'hashtags': [], 'symbols': [], 'user_mentions... {'media': [{'id': 892177413194625024, 'id_str'... 33819 False This is Tilly. She's just checking pup on you.... NaN ... 0.0 NaN NaN NaN 6514 False NaN <a href="http://twitter.com/download/iphone" r... False {'id': 4196983835, 'id_str': '4196983835', 'na...
2 NaN NaN 2017-07-31 00:18:03 [0, 121] {'hashtags': [], 'symbols': [], 'user_mentions... {'media': [{'id': 891815175371796480, 'id_str'... 25461 False This is Archie. He is a rare Norwegian Pouncin... NaN ... 0.0 NaN NaN NaN 4328 False NaN <a href="http://twitter.com/download/iphone" r... False {'id': 4196983835, 'id_str': '4196983835', 'na...

3 rows × 31 columns

In [9]:
# List the column names of tweet_json to decide which ones to keep
tweet_json.columns
Out[9]:
Index(['contributors', 'coordinates', 'created_at', 'display_text_range',
       'entities', 'extended_entities', 'favorite_count', 'favorited',
       'full_text', 'geo', 'id', 'id_str', 'in_reply_to_screen_name',
       'in_reply_to_status_id', 'in_reply_to_status_id_str',
       'in_reply_to_user_id', 'in_reply_to_user_id_str', 'is_quote_status',
       'lang', 'place', 'possibly_sensitive', 'possibly_sensitive_appealable',
       'quoted_status', 'quoted_status_id', 'quoted_status_id_str',
       'retweet_count', 'retweeted', 'retweeted_status', 'source', 'truncated',
       'user'],
      dtype='object')
In [10]:
# Keep only the 'id', 'favorite_count' and 'retweet_count' columns
columns_of_interest = ['id', 'favorite_count', 'retweet_count']
tweet_json = tweet_json.loc[:, columns_of_interest]

# Preview the first 5 rows
tweet_json.head()
Out[10]:
id favorite_count retweet_count
0 892420643555336193 39467 8853
1 892177421306343426 33819 6514
2 891815181378084864 25461 4328
3 891689557279858688 42908 8964
4 891327558926688256 41048 9774

Assess

  • We now have three datasets: twitter_archive, image_pred and tweet_json.
  • First, let's display them one by one for visual assessment.
In [11]:
# Display the twitter_archive DataFrame for visual assessment
# (the rendered output elides the middle rows with "...")
twitter_archive
Out[11]:
tweet_id in_reply_to_status_id in_reply_to_user_id timestamp source text retweeted_status_id retweeted_status_user_id retweeted_status_timestamp expanded_urls rating_numerator rating_denominator name doggo floofer pupper puppo
0 892420643555336193 NaN NaN 2017-08-01 16:23:56 +0000 <a href="http://twitter.com/download/iphone" r... This is Phineas. He's a mystical boy. Only eve... NaN NaN NaN https://twitter.com/dog_rates/status/892420643... 13 10 Phineas None None None None
1 892177421306343426 NaN NaN 2017-08-01 00:17:27 +0000 <a href="http://twitter.com/download/iphone" r... This is Tilly. She's just checking pup on you.... NaN NaN NaN https://twitter.com/dog_rates/status/892177421... 13 10 Tilly None None None None
2 891815181378084864 NaN NaN 2017-07-31 00:18:03 +0000 <a href="http://twitter.com/download/iphone" r... This is Archie. He is a rare Norwegian Pouncin... NaN NaN NaN https://twitter.com/dog_rates/status/891815181... 12 10 Archie None None None None
3 891689557279858688 NaN NaN 2017-07-30 15:58:51 +0000 <a href="http://twitter.com/download/iphone" r... This is Darla. She commenced a snooze mid meal... NaN NaN NaN https://twitter.com/dog_rates/status/891689557... 13 10 Darla None None None None
4 891327558926688256 NaN NaN 2017-07-29 16:00:24 +0000 <a href="http://twitter.com/download/iphone" r... This is Franklin. He would like you to stop ca... NaN NaN NaN https://twitter.com/dog_rates/status/891327558... 12 10 Franklin None None None None
5 891087950875897856 NaN NaN 2017-07-29 00:08:17 +0000 <a href="http://twitter.com/download/iphone" r... Here we have a majestic great white breaching ... NaN NaN NaN https://twitter.com/dog_rates/status/891087950... 13 10 None None None None None
6 890971913173991426 NaN NaN 2017-07-28 16:27:12 +0000 <a href="http://twitter.com/download/iphone" r... Meet Jax. He enjoys ice cream so much he gets ... NaN NaN NaN https://gofundme.com/ydvmve-surgery-for-jax,ht... 13 10 Jax None None None None
7 890729181411237888 NaN NaN 2017-07-28 00:22:40 +0000 <a href="http://twitter.com/download/iphone" r... When you watch your owner call another dog a g... NaN NaN NaN https://twitter.com/dog_rates/status/890729181... 13 10 None None None None None
8 890609185150312448 NaN NaN 2017-07-27 16:25:51 +0000 <a href="http://twitter.com/download/iphone" r... This is Zoey. She doesn't want to be one of th... NaN NaN NaN https://twitter.com/dog_rates/status/890609185... 13 10 Zoey None None None None
9 890240255349198849 NaN NaN 2017-07-26 15:59:51 +0000 <a href="http://twitter.com/download/iphone" r... This is Cassie. She is a college pup. Studying... NaN NaN NaN https://twitter.com/dog_rates/status/890240255... 14 10 Cassie doggo None None None
10 890006608113172480 NaN NaN 2017-07-26 00:31:25 +0000 <a href="http://twitter.com/download/iphone" r... This is Koda. He is a South Australian decksha... NaN NaN NaN https://twitter.com/dog_rates/status/890006608... 13 10 Koda None None None None
11 889880896479866881 NaN NaN 2017-07-25 16:11:53 +0000 <a href="http://twitter.com/download/iphone" r... This is Bruno. He is a service shark. Only get... NaN NaN NaN https://twitter.com/dog_rates/status/889880896... 13 10 Bruno None None None None
12 889665388333682689 NaN NaN 2017-07-25 01:55:32 +0000 <a href="http://twitter.com/download/iphone" r... Here's a puppo that seems to be on the fence a... NaN NaN NaN https://twitter.com/dog_rates/status/889665388... 13 10 None None None None puppo
13 889638837579907072 NaN NaN 2017-07-25 00:10:02 +0000 <a href="http://twitter.com/download/iphone" r... This is Ted. He does his best. Sometimes that'... NaN NaN NaN https://twitter.com/dog_rates/status/889638837... 12 10 Ted None None None None
14 889531135344209921 NaN NaN 2017-07-24 17:02:04 +0000 <a href="http://twitter.com/download/iphone" r... This is Stuart. He's sporting his favorite fan... NaN NaN NaN https://twitter.com/dog_rates/status/889531135... 13 10 Stuart None None None puppo
15 889278841981685760 NaN NaN 2017-07-24 00:19:32 +0000 <a href="http://twitter.com/download/iphone" r... This is Oliver. You're witnessing one of his m... NaN NaN NaN https://twitter.com/dog_rates/status/889278841... 13 10 Oliver None None None None
16 888917238123831296 NaN NaN 2017-07-23 00:22:39 +0000 <a href="http://twitter.com/download/iphone" r... This is Jim. He found a fren. Taught him how t... NaN NaN NaN https://twitter.com/dog_rates/status/888917238... 12 10 Jim None None None None
17 888804989199671297 NaN NaN 2017-07-22 16:56:37 +0000 <a href="http://twitter.com/download/iphone" r... This is Zeke. He has a new stick. Very proud o... NaN NaN NaN https://twitter.com/dog_rates/status/888804989... 13 10 Zeke None None None None
18 888554962724278272 NaN NaN 2017-07-22 00:23:06 +0000 <a href="http://twitter.com/download/iphone" r... This is Ralphus. He's powering up. Attempting ... NaN NaN NaN https://twitter.com/dog_rates/status/888554962... 13 10 Ralphus None None None None
19 888202515573088257 NaN NaN 2017-07-21 01:02:36 +0000 <a href="http://twitter.com/download/iphone" r... RT @dog_rates: This is Canela. She attempted s... 8.874740e+17 4.196984e+09 2017-07-19 00:47:34 +0000 https://twitter.com/dog_rates/status/887473957... 13 10 Canela None None None None
20 888078434458587136 NaN NaN 2017-07-20 16:49:33 +0000 <a href="http://twitter.com/download/iphone" r... This is Gerald. He was just told he didn't get... NaN NaN NaN https://twitter.com/dog_rates/status/888078434... 12 10 Gerald None None None None
21 887705289381826560 NaN NaN 2017-07-19 16:06:48 +0000 <a href="http://twitter.com/download/iphone" r... This is Jeffrey. He has a monopoly on the pool... NaN NaN NaN https://twitter.com/dog_rates/status/887705289... 13 10 Jeffrey None None None None
22 887517139158093824 NaN NaN 2017-07-19 03:39:09 +0000 <a href="http://twitter.com/download/iphone" r... I've yet to rate a Venezuelan Hover Wiener. Th... NaN NaN NaN https://twitter.com/dog_rates/status/887517139... 14 10 such None None None None
23 887473957103951883 NaN NaN 2017-07-19 00:47:34 +0000 <a href="http://twitter.com/download/iphone" r... This is Canela. She attempted some fancy porch... NaN NaN NaN https://twitter.com/dog_rates/status/887473957... 13 10 Canela None None None None
24 887343217045368832 NaN NaN 2017-07-18 16:08:03 +0000 <a href="http://twitter.com/download/iphone" r... You may not have known you needed to see this ... NaN NaN NaN https://twitter.com/dog_rates/status/887343217... 13 10 None None None None None
25 887101392804085760 NaN NaN 2017-07-18 00:07:08 +0000 <a href="http://twitter.com/download/iphone" r... This... is a Jubilant Antarctic House Bear. We... NaN NaN NaN https://twitter.com/dog_rates/status/887101392... 12 10 None None None None None
26 886983233522544640 NaN NaN 2017-07-17 16:17:36 +0000 <a href="http://twitter.com/download/iphone" r... This is Maya. She's very shy. Rarely leaves he... NaN NaN NaN https://twitter.com/dog_rates/status/886983233... 13 10 Maya None None None None
27 886736880519319552 NaN NaN 2017-07-16 23:58:41 +0000 <a href="http://twitter.com/download/iphone" r... This is Mingus. He's a wonderful father to his... NaN NaN NaN https://www.gofundme.com/mingusneedsus,https:/... 13 10 Mingus None None None None
28 886680336477933568 NaN NaN 2017-07-16 20:14:00 +0000 <a href="http://twitter.com/download/iphone" r... This is Derek. He's late for a dog meeting. 13... NaN NaN NaN https://twitter.com/dog_rates/status/886680336... 13 10 Derek None None None None
29 886366144734445568 NaN NaN 2017-07-15 23:25:31 +0000 <a href="http://twitter.com/download/iphone" r... This is Roscoe. Another pupper fallen victim t... NaN NaN NaN https://twitter.com/dog_rates/status/886366144... 12 10 Roscoe None None pupper None
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
2326 666411507551481857 NaN NaN 2015-11-17 00:24:19 +0000 <a href="http://twitter.com/download/iphone" r... This is quite the dog. Gets really excited whe... NaN NaN NaN https://twitter.com/dog_rates/status/666411507... 2 10 quite None None None None
2327 666407126856765440 NaN NaN 2015-11-17 00:06:54 +0000 <a href="http://twitter.com/download/iphone" r... This is a southern Vesuvius bumblegruff. Can d... NaN NaN NaN https://twitter.com/dog_rates/status/666407126... 7 10 a None None None None
2328 666396247373291520 NaN NaN 2015-11-16 23:23:41 +0000 <a href="http://twitter.com/download/iphone" r... Oh goodness. A super rare northeast Qdoba kang... NaN NaN NaN https://twitter.com/dog_rates/status/666396247... 9 10 None None None None None
2329 666373753744588802 NaN NaN 2015-11-16 21:54:18 +0000 <a href="http://twitter.com/download/iphone" r... Those are sunglasses and a jean jacket. 11/10 ... NaN NaN NaN https://twitter.com/dog_rates/status/666373753... 11 10 None None None None None
2330 666362758909284353 NaN NaN 2015-11-16 21:10:36 +0000 <a href="http://twitter.com/download/iphone" r... Unique dog here. Very small. Lives in containe... NaN NaN NaN https://twitter.com/dog_rates/status/666362758... 6 10 None None None None None
2331 666353288456101888 NaN NaN 2015-11-16 20:32:58 +0000 <a href="http://twitter.com/download/iphone" r... Here we have a mixed Asiago from the Galápagos... NaN NaN NaN https://twitter.com/dog_rates/status/666353288... 8 10 None None None None None
2332 666345417576210432 NaN NaN 2015-11-16 20:01:42 +0000 <a href="http://twitter.com/download/iphone" r... Look at this jokester thinking seat belt laws ... NaN NaN NaN https://twitter.com/dog_rates/status/666345417... 10 10 None None None None None
2333 666337882303524864 NaN NaN 2015-11-16 19:31:45 +0000 <a href="http://twitter.com/download/iphone" r... This is an extremely rare horned Parthenon. No... NaN NaN NaN https://twitter.com/dog_rates/status/666337882... 9 10 an None None None None
2334 666293911632134144 NaN NaN 2015-11-16 16:37:02 +0000 <a href="http://twitter.com/download/iphone" r... This is a funny dog. Weird toes. Won't come do... NaN NaN NaN https://twitter.com/dog_rates/status/666293911... 3 10 a None None None None
2335 666287406224695296 NaN NaN 2015-11-16 16:11:11 +0000 <a href="http://twitter.com/download/iphone" r... This is an Albanian 3 1/2 legged Episcopalian... NaN NaN NaN https://twitter.com/dog_rates/status/666287406... 1 2 an None None None None
2336 666273097616637952 NaN NaN 2015-11-16 15:14:19 +0000 <a href="http://twitter.com/download/iphone" r... Can take selfies 11/10 https://t.co/ws2AMaNwPW NaN NaN NaN https://twitter.com/dog_rates/status/666273097... 11 10 None None None None None
2337 666268910803644416 NaN NaN 2015-11-16 14:57:41 +0000 <a href="http://twitter.com/download/iphone" r... Very concerned about fellow dog trapped in com... NaN NaN NaN https://twitter.com/dog_rates/status/666268910... 10 10 None None None None None
2338 666104133288665088 NaN NaN 2015-11-16 04:02:55 +0000 <a href="http://twitter.com/download/iphone" r... Not familiar with this breed. No tail (weird).... NaN NaN NaN https://twitter.com/dog_rates/status/666104133... 1 10 None None None None None
2339 666102155909144576 NaN NaN 2015-11-16 03:55:04 +0000 <a href="http://twitter.com/download/iphone" r... Oh my. Here you are seeing an Adobe Setter giv... NaN NaN NaN https://twitter.com/dog_rates/status/666102155... 11 10 None None None None None
2340 666099513787052032 NaN NaN 2015-11-16 03:44:34 +0000 <a href="http://twitter.com/download/iphone" r... Can stand on stump for what seems like a while... NaN NaN NaN https://twitter.com/dog_rates/status/666099513... 8 10 None None None None None
2341 666094000022159362 NaN NaN 2015-11-16 03:22:39 +0000 <a href="http://twitter.com/download/iphone" r... This appears to be a Mongolian Presbyterian mi... NaN NaN NaN https://twitter.com/dog_rates/status/666094000... 9 10 None None None None None
2342 666082916733198337 NaN NaN 2015-11-16 02:38:37 +0000 <a href="http://twitter.com/download/iphone" r... Here we have a well-established sunblockerspan... NaN NaN NaN https://twitter.com/dog_rates/status/666082916... 6 10 None None None None None
2343 666073100786774016 NaN NaN 2015-11-16 01:59:36 +0000 <a href="http://twitter.com/download/iphone" r... Let's hope this flight isn't Malaysian (lol). ... NaN NaN NaN https://twitter.com/dog_rates/status/666073100... 10 10 None None None None None
2344 666071193221509120 NaN NaN 2015-11-16 01:52:02 +0000 <a href="http://twitter.com/download/iphone" r... Here we have a northern speckled Rhododendron.... NaN NaN NaN https://twitter.com/dog_rates/status/666071193... 9 10 None None None None None
2345 666063827256086533 NaN NaN 2015-11-16 01:22:45 +0000 <a href="http://twitter.com/download/iphone" r... This is the happiest dog you will ever see. Ve... NaN NaN NaN https://twitter.com/dog_rates/status/666063827... 10 10 the None None None None
2346 666058600524156928 NaN NaN 2015-11-16 01:01:59 +0000 <a href="http://twitter.com/download/iphone" r... Here is the Rand Paul of retrievers folks! He'... NaN NaN NaN https://twitter.com/dog_rates/status/666058600... 8 10 the None None None None
2347 666057090499244032 NaN NaN 2015-11-16 00:55:59 +0000 <a href="http://twitter.com/download/iphone" r... My oh my. This is a rare blond Canadian terrie... NaN NaN NaN https://twitter.com/dog_rates/status/666057090... 9 10 a None None None None
2348 666055525042405380 NaN NaN 2015-11-16 00:49:46 +0000 <a href="http://twitter.com/download/iphone" r... Here is a Siberian heavily armored polar bear ... NaN NaN NaN https://twitter.com/dog_rates/status/666055525... 10 10 a None None None None
2349 666051853826850816 NaN NaN 2015-11-16 00:35:11 +0000 <a href="http://twitter.com/download/iphone" r... This is an odd dog. Hard on the outside but lo... NaN NaN NaN https://twitter.com/dog_rates/status/666051853... 2 10 an None None None None
2350 666050758794694657 NaN NaN 2015-11-16 00:30:50 +0000 <a href="http://twitter.com/download/iphone" r... This is a truly beautiful English Wilson Staff... NaN NaN NaN https://twitter.com/dog_rates/status/666050758... 10 10 a None None None None
2351 666049248165822465 NaN NaN 2015-11-16 00:24:50 +0000 <a href="http://twitter.com/download/iphone" r... Here we have a 1949 1st generation vulpix. Enj... NaN NaN NaN https://twitter.com/dog_rates/status/666049248... 5 10 None None None None None
2352 666044226329800704 NaN NaN 2015-11-16 00:04:52 +0000 <a href="http://twitter.com/download/iphone" r... This is a purebred Piers Morgan. Loves to Netf... NaN NaN NaN https://twitter.com/dog_rates/status/666044226... 6 10 a None None None None
2353 666033412701032449 NaN NaN 2015-11-15 23:21:54 +0000 <a href="http://twitter.com/download/iphone" r... Here is a very happy pup. Big fan of well-main... NaN NaN NaN https://twitter.com/dog_rates/status/666033412... 9 10 a None None None None
2354 666029285002620928 NaN NaN 2015-11-15 23:05:30 +0000 <a href="http://twitter.com/download/iphone" r... This is a western brown Mitsubishi terrier. Up... NaN NaN NaN https://twitter.com/dog_rates/status/666029285... 7 10 a None None None None
2355 666020888022790149 NaN NaN 2015-11-15 22:32:08 +0000 <a href="http://twitter.com/download/iphone" r... Here we have a Japanese Irish Setter. Lost eye... NaN NaN NaN https://twitter.com/dog_rates/status/666020888... 8 10 None None None None None

2356 rows × 17 columns

In [12]:
# Display the image_pred DataFrame for visual assessment
# (the rendered output elides the middle rows with "...")
image_pred
Out[12]:
tweet_id jpg_url img_num p1 p1_conf p1_dog p2 p2_conf p2_dog p3 p3_conf p3_dog
0 666020888022790149 https://pbs.twimg.com/media/CT4udn0WwAA0aMy.jpg 1 Welsh_springer_spaniel 0.465074 True collie 0.156665 True Shetland_sheepdog 0.061428 True
1 666029285002620928 https://pbs.twimg.com/media/CT42GRgUYAA5iDo.jpg 1 redbone 0.506826 True miniature_pinscher 0.074192 True Rhodesian_ridgeback 0.072010 True
2 666033412701032449 https://pbs.twimg.com/media/CT4521TWwAEvMyu.jpg 1 German_shepherd 0.596461 True malinois 0.138584 True bloodhound 0.116197 True
3 666044226329800704 https://pbs.twimg.com/media/CT5Dr8HUEAA-lEu.jpg 1 Rhodesian_ridgeback 0.408143 True redbone 0.360687 True miniature_pinscher 0.222752 True
4 666049248165822465 https://pbs.twimg.com/media/CT5IQmsXIAAKY4A.jpg 1 miniature_pinscher 0.560311 True Rottweiler 0.243682 True Doberman 0.154629 True
5 666050758794694657 https://pbs.twimg.com/media/CT5Jof1WUAEuVxN.jpg 1 Bernese_mountain_dog 0.651137 True English_springer 0.263788 True Greater_Swiss_Mountain_dog 0.016199 True
6 666051853826850816 https://pbs.twimg.com/media/CT5KoJ1WoAAJash.jpg 1 box_turtle 0.933012 False mud_turtle 0.045885 False terrapin 0.017885 False
7 666055525042405380 https://pbs.twimg.com/media/CT5N9tpXIAAifs1.jpg 1 chow 0.692517 True Tibetan_mastiff 0.058279 True fur_coat 0.054449 False
8 666057090499244032 https://pbs.twimg.com/media/CT5PY90WoAAQGLo.jpg 1 shopping_cart 0.962465 False shopping_basket 0.014594 False golden_retriever 0.007959 True
9 666058600524156928 https://pbs.twimg.com/media/CT5Qw94XAAA_2dP.jpg 1 miniature_poodle 0.201493 True komondor 0.192305 True soft-coated_wheaten_terrier 0.082086 True
10 666063827256086533 https://pbs.twimg.com/media/CT5Vg_wXIAAXfnj.jpg 1 golden_retriever 0.775930 True Tibetan_mastiff 0.093718 True Labrador_retriever 0.072427 True
11 666071193221509120 https://pbs.twimg.com/media/CT5cN_3WEAAlOoZ.jpg 1 Gordon_setter 0.503672 True Yorkshire_terrier 0.174201 True Pekinese 0.109454 True
12 666073100786774016 https://pbs.twimg.com/media/CT5d9DZXAAALcwe.jpg 1 Walker_hound 0.260857 True English_foxhound 0.175382 True Ibizan_hound 0.097471 True
13 666082916733198337 https://pbs.twimg.com/media/CT5m4VGWEAAtKc8.jpg 1 pug 0.489814 True bull_mastiff 0.404722 True French_bulldog 0.048960 True
14 666094000022159362 https://pbs.twimg.com/media/CT5w9gUW4AAsBNN.jpg 1 bloodhound 0.195217 True German_shepherd 0.078260 True malinois 0.075628 True
15 666099513787052032 https://pbs.twimg.com/media/CT51-JJUEAA6hV8.jpg 1 Lhasa 0.582330 True Shih-Tzu 0.166192 True Dandie_Dinmont 0.089688 True
16 666102155909144576 https://pbs.twimg.com/media/CT54YGiWUAEZnoK.jpg 1 English_setter 0.298617 True Newfoundland 0.149842 True borzoi 0.133649 True
17 666104133288665088 https://pbs.twimg.com/media/CT56LSZWoAAlJj2.jpg 1 hen 0.965932 False cock 0.033919 False partridge 0.000052 False
18 666268910803644416 https://pbs.twimg.com/media/CT8QCd1WEAADXws.jpg 1 desktop_computer 0.086502 False desk 0.085547 False bookcase 0.079480 False
19 666273097616637952 https://pbs.twimg.com/media/CT8T1mtUwAA3aqm.jpg 1 Italian_greyhound 0.176053 True toy_terrier 0.111884 True basenji 0.111152 True
20 666287406224695296 https://pbs.twimg.com/media/CT8g3BpUEAAuFjg.jpg 1 Maltese_dog 0.857531 True toy_poodle 0.063064 True miniature_poodle 0.025581 True
21 666293911632134144 https://pbs.twimg.com/media/CT8mx7KW4AEQu8N.jpg 1 three-toed_sloth 0.914671 False otter 0.015250 False great_grey_owl 0.013207 False
22 666337882303524864 https://pbs.twimg.com/media/CT9OwFIWEAMuRje.jpg 1 ox 0.416669 False Newfoundland 0.278407 True groenendael 0.102643 True
23 666345417576210432 https://pbs.twimg.com/media/CT9Vn7PWoAA_ZCM.jpg 1 golden_retriever 0.858744 True Chesapeake_Bay_retriever 0.054787 True Labrador_retriever 0.014241 True
24 666353288456101888 https://pbs.twimg.com/media/CT9cx0tUEAAhNN_.jpg 1 malamute 0.336874 True Siberian_husky 0.147655 True Eskimo_dog 0.093412 True
25 666362758909284353 https://pbs.twimg.com/media/CT9lXGsUcAAyUFt.jpg 1 guinea_pig 0.996496 False skunk 0.002402 False hamster 0.000461 False
26 666373753744588802 https://pbs.twimg.com/media/CT9vZEYWUAAlZ05.jpg 1 soft-coated_wheaten_terrier 0.326467 True Afghan_hound 0.259551 True briard 0.206803 True
27 666396247373291520 https://pbs.twimg.com/media/CT-D2ZHWIAA3gK1.jpg 1 Chihuahua 0.978108 True toy_terrier 0.009397 True papillon 0.004577 True
28 666407126856765440 https://pbs.twimg.com/media/CT-NvwmW4AAugGZ.jpg 1 black-and-tan_coonhound 0.529139 True bloodhound 0.244220 True flat-coated_retriever 0.173810 True
29 666411507551481857 https://pbs.twimg.com/media/CT-RugiWIAELEaq.jpg 1 coho 0.404640 False barracouta 0.271485 False gar 0.189945 False
... ... ... ... ... ... ... ... ... ... ... ... ...
2045 886366144734445568 https://pbs.twimg.com/media/DE0BTnQUwAApKEH.jpg 1 French_bulldog 0.999201 True Chihuahua 0.000361 True Boston_bull 0.000076 True
2046 886680336477933568 https://pbs.twimg.com/media/DE4fEDzWAAAyHMM.jpg 1 convertible 0.738995 False sports_car 0.139952 False car_wheel 0.044173 False
2047 886736880519319552 https://pbs.twimg.com/media/DE5Se8FXcAAJFx4.jpg 1 kuvasz 0.309706 True Great_Pyrenees 0.186136 True Dandie_Dinmont 0.086346 True
2048 886983233522544640 https://pbs.twimg.com/media/DE8yicJW0AAAvBJ.jpg 2 Chihuahua 0.793469 True toy_terrier 0.143528 True can_opener 0.032253 False
2049 887101392804085760 https://pbs.twimg.com/media/DE-eAq6UwAA-jaE.jpg 1 Samoyed 0.733942 True Eskimo_dog 0.035029 True Staffordshire_bullterrier 0.029705 True
2050 887343217045368832 https://pbs.twimg.com/ext_tw_video_thumb/88734... 1 Mexican_hairless 0.330741 True sea_lion 0.275645 False Weimaraner 0.134203 True
2051 887473957103951883 https://pbs.twimg.com/media/DFDw2tyUQAAAFke.jpg 2 Pembroke 0.809197 True Rhodesian_ridgeback 0.054950 True beagle 0.038915 True
2052 887517139158093824 https://pbs.twimg.com/ext_tw_video_thumb/88751... 1 limousine 0.130432 False tow_truck 0.029175 False shopping_cart 0.026321 False
2053 887705289381826560 https://pbs.twimg.com/media/DFHDQBbXgAEqY7t.jpg 1 basset 0.821664 True redbone 0.087582 True Weimaraner 0.026236 True
2054 888078434458587136 https://pbs.twimg.com/media/DFMWn56WsAAkA7B.jpg 1 French_bulldog 0.995026 True pug 0.000932 True bull_mastiff 0.000903 True
2055 888202515573088257 https://pbs.twimg.com/media/DFDw2tyUQAAAFke.jpg 2 Pembroke 0.809197 True Rhodesian_ridgeback 0.054950 True beagle 0.038915 True
2056 888554962724278272 https://pbs.twimg.com/media/DFTH_O-UQAACu20.jpg 3 Siberian_husky 0.700377 True Eskimo_dog 0.166511 True malamute 0.111411 True
2057 888804989199671297 https://pbs.twimg.com/media/DFWra-3VYAA2piG.jpg 1 golden_retriever 0.469760 True Labrador_retriever 0.184172 True English_setter 0.073482 True
2058 888917238123831296 https://pbs.twimg.com/media/DFYRgsOUQAARGhO.jpg 1 golden_retriever 0.714719 True Tibetan_mastiff 0.120184 True Labrador_retriever 0.105506 True
2059 889278841981685760 https://pbs.twimg.com/ext_tw_video_thumb/88927... 1 whippet 0.626152 True borzoi 0.194742 True Saluki 0.027351 True
2060 889531135344209921 https://pbs.twimg.com/media/DFg_2PVW0AEHN3p.jpg 1 golden_retriever 0.953442 True Labrador_retriever 0.013834 True redbone 0.007958 True
2061 889638837579907072 https://pbs.twimg.com/media/DFihzFfXsAYGDPR.jpg 1 French_bulldog 0.991650 True boxer 0.002129 True Staffordshire_bullterrier 0.001498 True
2062 889665388333682689 https://pbs.twimg.com/media/DFi579UWsAAatzw.jpg 1 Pembroke 0.966327 True Cardigan 0.027356 True basenji 0.004633 True
2063 889880896479866881 https://pbs.twimg.com/media/DFl99B1WsAITKsg.jpg 1 French_bulldog 0.377417 True Labrador_retriever 0.151317 True muzzle 0.082981 False
2064 890006608113172480 https://pbs.twimg.com/media/DFnwSY4WAAAMliS.jpg 1 Samoyed 0.957979 True Pomeranian 0.013884 True chow 0.008167 True
2065 890240255349198849 https://pbs.twimg.com/media/DFrEyVuW0AAO3t9.jpg 1 Pembroke 0.511319 True Cardigan 0.451038 True Chihuahua 0.029248 True
2066 890609185150312448 https://pbs.twimg.com/media/DFwUU__XcAEpyXI.jpg 1 Irish_terrier 0.487574 True Irish_setter 0.193054 True Chesapeake_Bay_retriever 0.118184 True
2067 890729181411237888 https://pbs.twimg.com/media/DFyBahAVwAAhUTd.jpg 2 Pomeranian 0.566142 True Eskimo_dog 0.178406 True Pembroke 0.076507 True
2068 890971913173991426 https://pbs.twimg.com/media/DF1eOmZXUAALUcq.jpg 1 Appenzeller 0.341703 True Border_collie 0.199287 True ice_lolly 0.193548 False
2069 891087950875897856 https://pbs.twimg.com/media/DF3HwyEWsAABqE6.jpg 1 Chesapeake_Bay_retriever 0.425595 True Irish_terrier 0.116317 True Indian_elephant 0.076902 False
2070 891327558926688256 https://pbs.twimg.com/media/DF6hr6BUMAAzZgT.jpg 2 basset 0.555712 True English_springer 0.225770 True German_short-haired_pointer 0.175219 True
2071 891689557279858688 https://pbs.twimg.com/media/DF_q7IAWsAEuuN8.jpg 1 paper_towel 0.170278 False Labrador_retriever 0.168086 True spatula 0.040836 False
2072 891815181378084864 https://pbs.twimg.com/media/DGBdLU1WsAANxJ9.jpg 1 Chihuahua 0.716012 True malamute 0.078253 True kelpie 0.031379 True
2073 892177421306343426 https://pbs.twimg.com/media/DGGmoV4XsAAUL6n.jpg 1 Chihuahua 0.323581 True Pekinese 0.090647 True papillon 0.068957 True
2074 892420643555336193 https://pbs.twimg.com/media/DGKD1-bXoAAIAUK.jpg 1 orange 0.097049 False bagel 0.085851 False banana 0.076110 False

2075 rows × 12 columns

In [13]:
# Display tweet_json (id, favorite_count, retweet_count) for visual assessment;
# pandas abbreviates the middle rows of a long frame in the rendered output.
tweet_json
Out[13]:
id favorite_count retweet_count
0 892420643555336193 39467 8853
1 892177421306343426 33819 6514
2 891815181378084864 25461 4328
3 891689557279858688 42908 8964
4 891327558926688256 41048 9774
5 891087950875897856 20562 3261
6 890971913173991426 12041 2158
7 890729181411237888 56848 16716
8 890609185150312448 28226 4429
9 890240255349198849 32467 7711
10 890006608113172480 31166 7624
11 889880896479866881 28268 5156
12 889665388333682689 38818 8538
13 889638837579907072 27672 4735
14 889531135344209921 15359 2321
15 889278841981685760 25652 5637
16 888917238123831296 29611 4709
17 888804989199671297 26080 4559
18 888554962724278272 20290 3732
19 888078434458587136 22201 3653
20 887705289381826560 30779 5609
21 887517139158093824 46959 12082
22 887473957103951883 69871 18781
23 887343217045368832 34222 10737
24 887101392804085760 31061 6167
25 886983233522544640 35859 8084
26 886736880519319552 12306 3443
27 886680336477933568 22798 4610
28 886366144734445568 21524 3316
29 886267009285017600 117 4
... ... ... ...
2324 666411507551481857 459 339
2325 666407126856765440 113 44
2326 666396247373291520 172 92
2327 666373753744588802 194 100
2328 666362758909284353 804 595
2329 666353288456101888 229 77
2330 666345417576210432 307 146
2331 666337882303524864 204 96
2332 666293911632134144 522 368
2333 666287406224695296 152 71
2334 666273097616637952 184 82
2335 666268910803644416 108 37
2336 666104133288665088 14765 6871
2337 666102155909144576 81 16
2338 666099513787052032 164 73
2339 666094000022159362 169 79
2340 666082916733198337 121 47
2341 666073100786774016 335 174
2342 666071193221509120 154 67
2343 666063827256086533 496 232
2344 666058600524156928 115 61
2345 666057090499244032 304 146
2346 666055525042405380 448 261
2347 666051853826850816 1253 879
2348 666050758794694657 136 60
2349 666049248165822465 111 41
2350 666044226329800704 311 147
2351 666033412701032449 128 47
2352 666029285002620928 132 48
2353 666020888022790149 2535 532

2354 rows × 3 columns

More deep

  • Let's dive in deeper
  • Assessing the data programmatically
In [14]:
# Concise summary of twitter_archive: dtype and non-null count per column, plus memory usage.
twitter_archive.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2356 entries, 0 to 2355
Data columns (total 17 columns):
tweet_id                      2356 non-null int64
in_reply_to_status_id         78 non-null float64
in_reply_to_user_id           78 non-null float64
timestamp                     2356 non-null object
source                        2356 non-null object
text                          2356 non-null object
retweeted_status_id           181 non-null float64
retweeted_status_user_id      181 non-null float64
retweeted_status_timestamp    181 non-null object
expanded_urls                 2297 non-null object
rating_numerator              2356 non-null int64
rating_denominator            2356 non-null int64
name                          2356 non-null object
doggo                         2356 non-null object
floofer                       2356 non-null object
pupper                        2356 non-null object
puppo                         2356 non-null object
dtypes: float64(4), int64(3), object(10)
memory usage: 313.0+ KB
In [15]:
# Descriptive statistics (count, mean, std, min, quartiles, max) for the numeric columns of twitter_archive.
twitter_archive.describe()
Out[15]:
tweet_id in_reply_to_status_id in_reply_to_user_id retweeted_status_id retweeted_status_user_id rating_numerator rating_denominator
count 2.356000e+03 7.800000e+01 7.800000e+01 1.810000e+02 1.810000e+02 2356.000000 2356.000000
mean 7.427716e+17 7.455079e+17 2.014171e+16 7.720400e+17 1.241698e+16 13.126486 10.455433
std 6.856705e+16 7.582492e+16 1.252797e+17 6.236928e+16 9.599254e+16 45.876648 6.745237
min 6.660209e+17 6.658147e+17 1.185634e+07 6.661041e+17 7.832140e+05 0.000000 0.000000
25% 6.783989e+17 6.757419e+17 3.086374e+08 7.186315e+17 4.196984e+09 10.000000 10.000000
50% 7.196279e+17 7.038708e+17 4.196984e+09 7.804657e+17 4.196984e+09 11.000000 10.000000
75% 7.993373e+17 8.257804e+17 4.196984e+09 8.203146e+17 4.196984e+09 12.000000 10.000000
max 8.924206e+17 8.862664e+17 8.405479e+17 8.874740e+17 7.874618e+17 1776.000000 170.000000
In [16]:
# Draw five random rows for a visual spot check (no seed is set, so the rows differ on every run).
twitter_archive.sample(n=5)
Out[16]:
tweet_id in_reply_to_status_id in_reply_to_user_id timestamp source text retweeted_status_id retweeted_status_user_id retweeted_status_timestamp expanded_urls rating_numerator rating_denominator name doggo floofer pupper puppo
1053 742465774154047488 NaN NaN 2016-06-13 21:16:49 +0000 <a href="http://twitter.com/download/iphone" r... Was just informed about this hero pupper and o... NaN NaN NaN https://twitter.com/dog_rates/status/742465774... 14 10 None None None pupper None
1503 692017291282812928 NaN NaN 2016-01-26 16:12:33 +0000 <a href="http://twitter.com/download/iphone" r... This is Kingsley Wellensworth III. He owns 7 r... NaN NaN NaN https://twitter.com/dog_rates/status/692017291... 9 10 Kingsley None None None None
1715 680221482581123072 NaN NaN 2015-12-25 03:00:14 +0000 <a href="http://twitter.com/download/iphone" r... This is CeCe. She's patiently waiting for Sant... NaN NaN NaN https://twitter.com/dog_rates/status/680221482... 10 10 CeCe None None None None
1104 735137028879360001 NaN NaN 2016-05-24 15:55:00 +0000 <a href="http://twitter.com/download/iphone" r... Meet Buckley. His family &amp; some neighbors ... NaN NaN NaN https://twitter.com/dog_rates/status/735137028... 9 10 Buckley None None pupper None
412 822975315408461824 NaN NaN 2017-01-22 01:12:59 +0000 <a href="http://twitter.com/download/iphone" r... This is Albus. He's soaked as h*ck. Seems to h... NaN NaN NaN https://twitter.com/dog_rates/status/822975315... 12 10 Albus None None None None
In [17]:
# Count the tweets per posting client; the source column holds the raw HTML anchor tag.
twitter_archive['source'].value_counts()
Out[17]:
<a href="http://twitter.com/download/iphone" rel="nofollow">Twitter for iPhone</a>     2221
<a href="http://vine.co" rel="nofollow">Vine - Make a Scene</a>                          91
<a href="http://twitter.com" rel="nofollow">Twitter Web Client</a>                       33
<a href="https://about.twitter.com/products/tweetdeck" rel="nofollow">TweetDeck</a>      11
Name: source, dtype: int64
In [18]:
# How many entries of the name column are title-cased (True) versus not (False)?
twitter_archive['name'].str.istitle().value_counts()
Out[18]:
True     2241
False     115
Name: name, dtype: int64
In [19]:
# Distinct entries of the name column that are written entirely in lower case.
lowers = twitter_archive.loc[twitter_archive['name'].str.islower(), 'name'].unique()
lowers
Out[19]:
array(['such', 'a', 'quite', 'not', 'one', 'incredibly', 'mad', 'an',
       'very', 'just', 'my', 'his', 'actually', 'getting', 'this',
       'unacceptable', 'all', 'old', 'infuriating', 'the', 'by',
       'officially', 'life', 'light', 'space'], dtype=object)
In [20]:
# Distinct entries of the name column that are not title-cased.
untitled = twitter_archive.loc[twitter_archive['name'].str.istitle().eq(False), 'name'].unique()
untitled
Out[20]:
array(['such', 'a', 'quite', 'not', 'one', 'incredibly', 'BeBe', 'mad',
       'an', 'very', 'just', 'DonDon', 'my', 'his', 'actually', 'getting',
       'this', 'unacceptable', 'all', 'old', 'infuriating', 'CeCe', 'the',
       'by', 'officially', 'life', 'light', 'space', 'JD', 'DayZ'],
      dtype=object)
In [21]:
# Names that are neither title-cased nor all lower case, i.e. written with mixed capitalisation.
untitled_unlowers = [name for name in untitled if name not in lowers]
untitled_unlowers
Out[21]:
['BeBe', 'DonDon', 'CeCe', 'JD', 'DayZ']

Since this project is about dog ratings, let's focus on the columns related to the rating, i.e. rating_numerator and rating_denominator

In [22]:
# Tweets whose rating denominator is below 10.
# Show the tweet text in full: None lifts the column-width limit (needs pandas >= 1.0).
# The former -1 sentinel is deprecated since pandas 1.0 and rejected by pandas 2.x.
pd.set_option('display.max_colwidth', None)
twitter_archive.loc[twitter_archive.rating_denominator < 10, ['text', 'rating_numerator', 'rating_denominator']]
Out[22]:
text rating_numerator rating_denominator
313 @jonnysun @Lin_Manuel ok jomny I know you're excited but 960/00 isn't a valid rating, 13/10 is tho 960 0
516 Meet Sam. She smiles 24/7 &amp; secretly aspires to be a reindeer. \nKeep Sam smiling by clicking and sharing this link:\nhttps://t.co/98tB8y7y7t https://t.co/LouL5vdvxx 24 7
2335 This is an Albanian 3 1/2 legged Episcopalian. Loves well-polished hardwood flooring. Penis on the collar. 9/10 https://t.co/d9NcXFKwLv 1 2
In [23]:
# Tweets whose rating denominator is greater than 10.
twitter_archive.loc[twitter_archive['rating_denominator'].gt(10), ['text', 'rating_numerator', 'rating_denominator']]
Out[23]:
text rating_numerator rating_denominator
342 @docmisterio account started on 11/15/15 11 15
433 The floofs have been released I repeat the floofs have been released. 84/70 https://t.co/NIYC820tmd 84 70
784 RT @dog_rates: After so many requests, this is Bretagne. She was the last surviving 9/11 search dog, and our second ever 14/10. RIP https:/… 9 11
902 Why does this never happen at my front door... 165/150 https://t.co/HmwrdfEfUE 165 150
1068 After so many requests, this is Bretagne. She was the last surviving 9/11 search dog, and our second ever 14/10. RIP https://t.co/XAVDNDaVgQ 9 11
1120 Say hello to this unbelievably well behaved squad of doggos. 204/170 would try to pet all at once https://t.co/yGQI3He3xv 204 170
1165 Happy 4/20 from the squad! 13/10 for all https://t.co/eV1diwds8a 4 20
1202 This is Bluebert. He just saw that both #FinalFur match ups are split 50/50. Amazed af. 11/10 https://t.co/Kky1DPG4iq 50 50
1228 Happy Saturday here's 9 puppers on a bench. 99/90 good work everybody https://t.co/mpvaVxKmc1 99 90
1254 Here's a brigade of puppers. All look very prepared for whatever happens next. 80/80 https://t.co/0eb7R1Om12 80 80
1274 From left to right:\nCletus, Jerome, Alejandro, Burp, &amp; Titson\nNone know where camera is. 45/50 would hug all at once https://t.co/sedre1ivTK 45 50
1351 Here is a whole flock of puppers. 60/50 I'll take the lot https://t.co/9dpcw6MdWa 60 50
1433 Happy Wednesday here's a bucket of pups. 44/40 would pet all at once https://t.co/HppvrYuamZ 44 40
1598 Yes I do realize a rating of 4/20 would've been fitting. However, it would be unjust to give these cooperative pups that low of a rating 4 20
1634 Two sneaky puppers were not initially seen, moving the rating to 143/130. Please forgive us. Thank you https://t.co/kRK51Y5ac3 143 130
1635 Someone help the girl is being mugged. Several are distracting her while two steal her shoes. Clever puppers 121/110 https://t.co/1zfnTJLt55 121 110
1662 This is Darrel. He just robbed a 7/11 and is in a high speed police chase. Was just spotted by the helicopter 10/10 https://t.co/7EsP8LmSp5 7 11
1663 I'm aware that I could've said 20/16, but here at WeRateDogs we are very professional. An inconsistent rating scale is simply irresponsible 20 16
1779 IT'S PUPPERGEDDON. Total of 144/120 ...I think https://t.co/ZanVtAtvIq 144 120
1843 Here we have an entire platoon of puppers. Total score: 88/80 would pet all at once https://t.co/y93p6FLvVw 88 80
  • Note: in row 342 the text "11/15/15" is the date the account started, not a rating, yet it was extracted as 11/15
In [24]:
# Tweets whose rating numerator is less than 10.
twitter_archive.loc[twitter_archive['rating_numerator'].lt(10), ['text', 'rating_numerator', 'rating_denominator']]
Out[24]:
text rating_numerator rating_denominator
45 This is Bella. She hopes her smile made you smile. If not, she is also offering you her favorite monkey. 13.5/10 https://t.co/qjrljjt948 5 10
229 This is Jerry. He's doing a distinguished tongue slip. Slightly patronizing tbh. You think you're better than us, Jerry? 6/10 hold me back https://t.co/DkOBbwulw1 6 10
315 When you're so blinded by your systematic plagiarism that you forget what day it is. 0/10 https://t.co/YbEJPkg4Ag 0 10
387 I was going to do 007/10, but the joke wasn't worth the &lt;10 rating 7 10
462 RT @dog_rates: Meet Herschel. He's slightly bigger than ur average pupper. Looks lonely. Could probably ride 7/10 would totally pet https:/… 7 10
485 RT @dog_rates: Meet Beau &amp; Wilbur. Wilbur stole Beau's bed from him. Wilbur now has so much room for activities. 9/10 for both pups https:/… 9 10
599 RT @dog_rates: Here we see a rare pouched pupper. Ample storage space. Looks alert. Jumps at random. Kicked open that door. 8/10 https://t.… 8 10
605 RT @dog_rates: Not familiar with this breed. No tail (weird). Only 2 legs. Doesn't bark. Surprisingly quick. Shits eggs. 1/10 https://t.co/… 1 10
730 Who keeps sending in pictures without dogs in them? This needs to stop. 5/10 for the mediocre road https://t.co/ELqelxWMrC 5 10
745 RT @dog_rates: This is Hank. He's mischievous af. Doesn't even know what he was trying to do here. 8/10 quit the shit Hank damn https://t.c… 8 10
764 RT @dog_rates: Meet Gerald. He's a fairly exotic doggo. Floofy af. Inadequate knees tho. Self conscious about large forehead. 8/10 https://… 8 10
765 This is Wesley. He's clearly trespassing. Seems rather h*ckin violent too. Weaponized forehead. 3/10 wouldn't let in https://t.co/pL7wbMRW7M 3 10
784 RT @dog_rates: After so many requests, this is Bretagne. She was the last surviving 9/11 search dog, and our second ever 14/10. RIP https:/… 9 11
814 Another pic without a dog in it? What am I supposed to do? Rate the carpet? Fine I will. 7/10 looks adequately comfy https://t.co/OJZQ6I4gGd 7 10
835 Meet Gerald. He's a fairly exotic doggo. Floofy af. Inadequate knees tho. Self conscious about large forehead. 8/10 https://t.co/WmczvjCWJq 8 10
845 This is Shadoe. Her tongue flies out of her mouth at random. Can't have a serious conversation with her. 9/10 https://t.co/Tytt15VquG 9 10
859 This is Clark. He collects teddy bears. It's absolutely h*ckin horrifying. 8/10 please stop this Clark https://t.co/EDMcwt86fU 8 10
860 RT @dog_rates: Meet Eve. She's a raging alcoholic 8/10 (would b 11/10 but pupper alcoholism is a tragic issue that I can't condone) https:/… 8 10
883 This is Fido. He can tell the weather. Not good at fetch tho. Never comes when called. 4/10 would probably still pet https://t.co/4gOv2Q3iKP 4 10
896 Meet Toby. He has a drinking problem. Inflatable marijuana plant in the back is also not a good look. 7/10 cmon Toby https://t.co/Cim4DSj6Oi 7 10
906 Teagan reads entire books in store so they're free. Loved 50 Shades of Grey (how dare I make that joke so late) 9/10 https://t.co/l46jwv5WYv 9 10
912 Here's another picture without a dog in it. Idk why you guys keep sending these. 4/10 just because that's a neat rug https://t.co/mOmnL19Wsl 4 10
915 This is Devón (pronounced "Eric"). He forgot how to eat the apple halfway through. Wtf Devón get it together. 8/10 https://t.co/7waRPODGyO 8 10
936 This is Hank. He's mischievous af. Doesn't even know what he was trying to do here. 8/10 quit the shit Hank damn https://t.co/3r7wjfsXHc 8 10
946 Here are three doggos completely misjudging an airborne stick. Decent efforts tho. All 9/10 https://t.co/HCXQL4fGVZ 9 10
948 Here's a doggo trying to catch some fish. 8/10 futile af (vid by @KellyBauerx) https://t.co/jwd0j6oWLE 8 10
956 Please stop sending it pictures that don't even have a doggo or pupper in them. Churlish af. 5/10 neat couch tho https://t.co/u2c9c7qSg8 5 10
964 This is Malcolm. He's absolutely terrified of heights. 8/10 hang in there pupper https://t.co/SVU00Sc9U2 8 10
966 This is Zoe. She was trying to stealthily take a picture of you but you just noticed. 9/10 not so sneaky pupper https://t.co/FfH3o88Vta 9 10
987 This is Dietrich. He hops at random. Other doggos don't understand him. It upsets him greatly. 8/10 would comfort https://t.co/U8cSRz8wzC 8 10
... ... ... ...
2312 This is Josep. He is a Rye Manganese mix. Can drive w eyes closed. Very irresponsible. Menace on the roadways. 5/10 https://t.co/XNGeDwrtYH 5 10
2314 This is a golden Buckminsterfullerene named Johm. Drives trucks. Lumberjack (?). Enjoys wall. 8/10 would hug softly https://t.co/uQbZJM2DQB 8 10
2315 This is Christoper. He is a spotted Penne. Can easily navigate stairs. 8/10 https://t.co/bg4TqvvkuF 8 10
2316 Cool dog. Enjoys couch. Low monotone bark. Very nice kicks. Pisses milk (must be rare). Can't go down stairs. 4/10 https://t.co/vXMKrJC81s 4 10
2317 This is Jimothy. He is a Botwanian Gouda. Can write (impressive). Very erect tail. Still looking for hoco date. 9/10 https://t.co/LEkZjZxESQ 9 10
2319 This is Scout. She is a black Downton Abbey. Isn't afraid to get dirty. 9/10 nothing bad to say https://t.co/kH60oka1HW 9 10
2320 Here we see a lone northeastern Cumberbatch. Half ladybug. Only builds with bricks. Very confident with body. 7/10 https://t.co/7LtjBS0GPK 7 10
2322 Oh boy what a pup! Sunglasses take this one to the next level. Weirdly folds front legs. Pretty big. 6/10 https://t.co/yECbFrSArM 6 10
2323 Here we have an Austrian Pulitzer. Collectors edition. Levitates (?). 7/10 would garden with https://t.co/NMQq6HIglK 7 10
2326 This is quite the dog. Gets really excited when not in water. Not very soft tho. Bad at fetch. Can't do tricks. 2/10 https://t.co/aMCTNWO94t 2 10
2327 This is a southern Vesuvius bumblegruff. Can drive a truck (wow). Made friends with 5 other nifty dogs (neat). 7/10 https://t.co/LopTBkKa8h 7 10
2328 Oh goodness. A super rare northeast Qdoba kangaroo mix. Massive feet. No pouch (disappointing). Seems alert. 9/10 https://t.co/Dc7b0E8qFE 9 10
2330 Unique dog here. Very small. Lives in container of Frosted Flakes (?). Short legs. Must be rare 6/10 would still pet https://t.co/XMD9CwjEnM 6 10
2331 Here we have a mixed Asiago from the Galápagos Islands. Only one ear working. Big fan of marijuana carpet. 8/10 https://t.co/tltQ5w9aUO 8 10
2333 This is an extremely rare horned Parthenon. Not amused. Wears shoes. Overall very nice. 9/10 would pet aggressively https://t.co/QpRjllzWAL 9 10
2334 This is a funny dog. Weird toes. Won't come down. Loves branch. Refuses to eat his food. Hard to cuddle with. 3/10 https://t.co/IIXis0zta0 3 10
2335 This is an Albanian 3 1/2 legged Episcopalian. Loves well-polished hardwood flooring. Penis on the collar. 9/10 https://t.co/d9NcXFKwLv 1 2
2338 Not familiar with this breed. No tail (weird). Only 2 legs. Doesn't bark. Surprisingly quick. Shits eggs. 1/10 https://t.co/Asgdc6kuLX 1 10
2340 Can stand on stump for what seems like a while. Built that birdhouse? Impressive. Made friends with a squirrel. 8/10 https://t.co/Ri4nMTLq5C 8 10
2341 This appears to be a Mongolian Presbyterian mix. Very tired. Tongue slip confirmed. 9/10 would lie down with https://t.co/mnioXo3IfP 9 10
2342 Here we have a well-established sunblockerspaniel. Lost his other flip-flop. 6/10 not very waterproof https://t.co/3RU6x0vHB7 6 10
2344 Here we have a northern speckled Rhododendron. Much sass. Gives 0 fucks. Good tongue. 9/10 would caress sensually https://t.co/ZoL8kq2XFx 9 10
2346 Here is the Rand Paul of retrievers folks! He's probably good at poker. Can drink beer (lol rad). 8/10 good dog https://t.co/pYAJkAe76p 8 10
2347 My oh my. This is a rare blond Canadian terrier on wheels. Only $8.98. Rather docile. 9/10 very rare https://t.co/yWBqbrzy8O 9 10
2349 This is an odd dog. Hard on the outside but loving on the inside. Petting still fun. Doesn't play catch well. 2/10 https://t.co/v5A4vzSDdc 2 10
2351 Here we have a 1949 1st generation vulpix. Enjoys sweat tea and Fox News. Cannot be phased. 5/10 https://t.co/4B7cOc1EDq 5 10
2352 This is a purebred Piers Morgan. Loves to Netflix and chill. Always looks like he forgot to unplug the iron. 6/10 https://t.co/DWnyCjf2mx 6 10
2353 Here is a very happy pup. Big fan of well-maintained decks. Just look at that tongue. 9/10 would cuddle af https://t.co/y671yMhoiR 9 10
2354 This is a western brown Mitsubishi terrier. Upset about leaf. Actually 2 dogs here. 7/10 would walk the shit out of https://t.co/r7mOb2m0UI 7 10
2355 Here we have a Japanese Irish Setter. Lost eye in Vietnam (?). Big fan of relaxing on stair. 8/10 would pet https://t.co/BLDqew2Ijj 8 10

440 rows × 3 columns

In [25]:
# check for rating_numerator values above 14
twitter_archive.loc[twitter_archive.rating_numerator > 14,['text','rating_numerator','rating_denominator']]
Out[25]:
text rating_numerator rating_denominator
55 @roushfenway These are good dogs but 17/10 is an emotional impulse rating. More like 13/10s 17 10
188 @dhmontgomery We also gave snoop dogg a 420/10 but I think that predated your research 420 10
189 @s8n You tried very hard to portray this good boy as not so good, but you have ultimately failed. His goodness shines through. 666/10 666 10
285 RT @KibaDva: I collected all the good dogs!! 15/10 @dog_rates #GoodDogs https://t.co/6UCGFczlOI 15 10
290 @markhoppus 182/10 182 10
291 @bragg6of8 @Andy_Pace_ we are still looking for the first 15/10 15 10
313 @jonnysun @Lin_Manuel ok jomny I know you're excited but 960/00 isn't a valid rating, 13/10 is tho 960 0
340 RT @dog_rates: This is Logan, the Chow who lived. He solemnly swears he's up to lots of good. H*ckin magical af 9.75/10 https://t.co/yBO5wu… 75 10
433 The floofs have been released I repeat the floofs have been released. 84/70 https://t.co/NIYC820tmd 84 70
516 Meet Sam. She smiles 24/7 &amp; secretly aspires to be a reindeer. \nKeep Sam smiling by clicking and sharing this link:\nhttps://t.co/98tB8y7y7t https://t.co/LouL5vdvxx 24 7
695 This is Logan, the Chow who lived. He solemnly swears he's up to lots of good. H*ckin magical af 9.75/10 https://t.co/yBO5wuqaPS 75 10
763 This is Sophie. She's a Jubilant Bush Pupper. Super h*ckin rare. Appears at random just to smile at the locals. 11.27/10 would smile back https://t.co/QFaUiIHxHq 27 10
902 Why does this never happen at my front door... 165/150 https://t.co/HmwrdfEfUE 165 150
979 This is Atticus. He's quite simply America af. 1776/10 https://t.co/GRXwMxLBkh 1776 10
1120 Say hello to this unbelievably well behaved squad of doggos. 204/170 would try to pet all at once https://t.co/yGQI3He3xv 204 170
1202 This is Bluebert. He just saw that both #FinalFur match ups are split 50/50. Amazed af. 11/10 https://t.co/Kky1DPG4iq 50 50
1228 Happy Saturday here's 9 puppers on a bench. 99/90 good work everybody https://t.co/mpvaVxKmc1 99 90
1254 Here's a brigade of puppers. All look very prepared for whatever happens next. 80/80 https://t.co/0eb7R1Om12 80 80
1274 From left to right:\nCletus, Jerome, Alejandro, Burp, &amp; Titson\nNone know where camera is. 45/50 would hug all at once https://t.co/sedre1ivTK 45 50
1351 Here is a whole flock of puppers. 60/50 I'll take the lot https://t.co/9dpcw6MdWa 60 50
1433 Happy Wednesday here's a bucket of pups. 44/40 would pet all at once https://t.co/HppvrYuamZ 44 40
1634 Two sneaky puppers were not initially seen, moving the rating to 143/130. Please forgive us. Thank you https://t.co/kRK51Y5ac3 143 130
1635 Someone help the girl is being mugged. Several are distracting her while two steal her shoes. Clever puppers 121/110 https://t.co/1zfnTJLt55 121 110
1663 I'm aware that I could've said 20/16, but here at WeRateDogs we are very professional. An inconsistent rating scale is simply irresponsible 20 16
1712 Here we have uncovered an entire battalion of holiday puppers. Average of 11.26/10 https://t.co/eNm2S6p9BD 26 10
1779 IT'S PUPPERGEDDON. Total of 144/120 ...I think https://t.co/ZanVtAtvIq 144 120
1843 Here we have an entire platoon of puppers. Total score: 88/80 would pet all at once https://t.co/y93p6FLvVw 88 80
2074 After so many requests... here you go.\n\nGood dogg. 420/10 https://t.co/yfAAo1gdeY 420 10

Important points here:

  • The account has its own rating system, and that is quite clear here, especially since a rating of 14/10 looks normal
  • From the above I found some outlier scores (1776, 420) that need more investigation to decide whether to include or drop them
  • Also, those above 100 seem to be related to more than one dog in a photo
  • Some ratings were extracted incorrectly: 75 instead of 9.75, 26 instead of 11.26, 27 instead of 11.27, and 50/50 instead of 11/10
  • Sometimes the ratings use decimal (float) numbers
  • I will collect these observations for further cleaning
In [26]:
# Read five randomly chosen tweet texts in full (unseeded, so the selection changes on each run).
twitter_archive['text'].sample(n=5).tolist()
Out[26]:
['This is Herm. He just wants to be like the other dogs. Sneaky tongue slip. Super fuzzy. 9/10 would cuddle firmly https://t.co/tg8h9lzCHv',
 'Meet Fizz. She thinks love is a social construct consisting solely of ideals perpetuated by mass media 11/10 woke af https://t.co/sPB5JMnWBn',
 "IT'S SO SMALL ERMERGERF 11/10 https://t.co/dNUbKOSiWW",
 "This is Eriq. His friend just reminded him of last year's super bowl. Not cool friend\n10/10 for Eriq\n6/10 for friend https://t.co/PlEXTofdpf",
 "This is Winnie. She lost her body saving a children's hospital from an avalanche. 13/10 what a h*ckin hero https://t.co/Tf0rh9ZgZe"]
  • While checking samples from the text column I noticed the sentence "We only rate dogs"; let's investigate this
In [27]:
# Find the tweets whose text contains the phrase "only rate dogs" (possibly non-dog submissions).
# A literal substring search replaces str.match('.*only rate dogs'): match anchors at the start
# of the string and '.' does not match a newline, so the old pattern skipped tweets in which
# the phrase comes after a line break.
twitter_archive.text[twitter_archive.text.str.contains('only rate dogs', regex=False)]
Out[27]:
25      This... is a Jubilant Antarctic House Bear. We only rate dogs. Please only send dogs. Thank you... 12/10 would suffocate in floof https://t.co/4Ad1jzJSdp         
59      Ugh not again. We only rate dogs. Please don't send in well-dressed  floppy-tongued street penguins. Dogs only please. Thank you... 12/10 https://t.co/WiAMbTkDPf 
93      I can't believe this keeps happening. This, is a birb taking a bath. We only rate dogs. Please only send dogs. Thank you... 12/10 https://t.co/pwY9PQhtP2         
118     RT @dog_rates: We only rate dogs. This is quite clearly a smol broken polar bear. We'd appreciate if you only send dogs. Thank you... 12/10…                      
127     Unbelievable. We only rate dogs. Please don't send in non-canines like the "I" from Pixar's opening credits. Thank you... 12/10 https://t.co/JMhDNv5wXZ           
131     Oh my this spooked me up. We only rate dogs, not happy ghosts. Please send dogs only. It's a very simple premise. Thank you... 13/10 https://t.co/M5Rz0R8SIQ      
141     We only rate dogs. Please don't send in Jesus. We're trying to remain professional and legitimate. Thank you... 14/10 https://t.co/wr3xsjeCIR                     
154     We only rate dogs. Please don't send perfectly toasted marshmallows attempting to drive. Thank you... 13/10 https://t.co/nvZyyrp0kd                               
169     We only rate dogs. This is quite clearly a smol broken polar bear. We'd appreciate if you only send dogs. Thank you... 12/10 https://t.co/g2nSyGenG9              
193     Guys, we only rate dogs. This is quite clearly a bulbasaur. Please only send dogs. Thank you... 12/10 human used pet, it's super effective https://t.co/Xc7uj1C64x
221     Seriously guys? Again? We only rate dogs. Please stop submitting other things like this super good hammerhead shark. Thank you... 12/10 https://t.co/TCMC90mSOT   
226     Please stop sending in animals other than dogs. We only rate dogs. Not Furry Ecuadorian Sea Turtles. Thank you... 12/10 https://t.co/UOE79zb6VU                   
246     C'mon guys. Please only send in dogs. We only rate dogs, not Exceptional-Tongued Peruvian Floor Bears. Thank you... 12/10 https://t.co/z30iQLiXNo                 
314     We only rate dogs. Please don't send in any non-canines like this Floppy Tongued House Panda. Thank you... 12/10 would still pet https://t.co/8fX2VkExnL          
386     RT @dog_rates: Please only send in dogs. We only rate dogs, not seemingly heartbroken ewoks. Thank you... still 10/10 would console https:/…                      
390     We only rate dogs. Please don't send in any more non-dogs like this Wild Albanian Street Moose. Thank you... 11/10 https://t.co/srXL2s868C                        
411     RT @dog_rates: We only rate dogs. Please don't send pics of men capturing low level clouds. Thank you... 11/10 https://t.co/rLi83ZyCL5                            
416     Please stop sending in non-canines like this Very Pettable Dozing Bath Tortoise. We only rate dogs. Only send dogs... 12/10 https://t.co/mcagPeENIh               
419     We only rate dogs. Please don't send pics of men capturing low level clouds. Thank you... 11/10 https://t.co/rLi83ZyCL5                                           
494     We only rate dogs. Please don't send in other things like this very good Christmas tree. Thank you... 13/10 https://t.co/rvSANEsQZJ                               
506     RT @dog_rates: Meet Sammy. At first I was like "that's a snowflake. we only rate dogs," but he would've melted by now, so 10/10 https://t.c…                      
521     Please only send in dogs. We only rate dogs, not seemingly heartbroken ewoks. Thank you... still 10/10 would console https://t.co/HIraYS1Bzo                      
538     RT @dog_rates: Idk why this keeps happening. We only rate dogs. Not Bangladeshi Couch Chipmunks. Please only send dogs... 12/10 https://t.c…                      
542     We only rate dogs. Please stop sending in non-canines like this Freudian Poof Lion. This is incredibly frustrating... 11/10 https://t.co/IZidSrBvhi               
732     Idk why this keeps happening. We only rate dogs. Not Bangladeshi Couch Chipmunks. Please only send dogs... 12/10 https://t.co/ya7bviQUUf                          
744     We only rate dogs. Pls stop sending in non-canines like this Urban Floof Giraffe. I can't handle this. 11/10 https://t.co/zHIqpM5Gni                              
759     RT @dog_rates: This is an East African Chalupa Seal. We only rate dogs. Please only send in dogs. Thank you... 10/10 https://t.co/iHe6liLwWR                      
773     RT @dog_rates: We only rate dogs. Pls stop sending in non-canines like this Mongolian grass snake. This is very frustrating. 11/10 https://…                      
801     Guys this is getting so out of hand. We only rate dogs. This is a Galapagos Speed Panda. Pls only send dogs... 10/10 https://t.co/8lpAGaZRFn                      
819     We only rate dogs. Pls stop sending in non-canines like this Arctic Floof Kangaroo. This is very frustrating. 11/10 https://t.co/qlUDuPoE3d                       
                                                                           ...                                                                                            
1017    This is a carrot. We only rate dogs. Please only send in dogs. You all really should know this by now ...11/10 https://t.co/9e48aPrBm2                            
1031    We only rate dogs. Pls stop sending in non-canines like this Jamaican Flop Seal. This is very very frustrating. 9/10 https://t.co/nc53zEN0hZ                      
1071    This is getting incredibly frustrating. This is a Mexican Golden Beaver. We only rate dogs. Only send dogs ...10/10 https://t.co/0yolOOyD3X                       
1077    This... is a Tyrannosaurus rex. We only rate dogs. Please only send in dogs. Thank you ...10/10 https://t.co/zxw8d5g94P                                           
1085    For the last time, we only rate dogs. Pls stop sending other animals like this Duck-Billed Platypus. Thank you. 9/10 https://t.co/twxYcPOafl                      
1090    We only rate dogs. Pls stop sending in non-canines like this Slovak Car Bunny. It makes my job very difficult. 11/10 https://t.co/VflvQLH2y5                      
1097    We only rate dogs. Please stop sending in non-canines like this Alaskan Flop Turtle. This is very frustrating. 10/10 https://t.co/qXteK6Atxc                      
1102    We only rate dogs. Please stop sending in your 31 year old sons that won't get out of your house. Thank you... 11/10 https://t.co/aTU53NNUkt                      
1121    We only rate dogs. Pls stop sending non-canines like this Bulgarian Eyeless Porch Bear. This is unacceptable... 9/10 https://t.co/2yctWAUZ3Z                      
1150    I'm getting super heckin frustrated with you all sending in non canines like this ostrich. We only rate dogs... 9/10 https://t.co/Rgbni2Ns8z                      
1193    People please. This is a Deadly Mediterranean Plop T-Rex. We only rate dogs. Only send in dogs. Thanks you... 11/10 https://t.co/2ATDsgHD4n                       
1207    This is a taco. We only rate dogs. Please only send in dogs. Dogs are what we rate. Not tacos. Thank you... 10/10 https://t.co/cxl6xGY8B9                         
1225    I know we only rate dogs, but since it's Easter I guess we could rate a bunny for a change. 10/10 petable as hell https://t.co/O2RlKXigHu                         
1234    Please don't send in any more polar bears. We only rate dogs. Thank you... 10/10 https://t.co/83RGhdIQz2                                                          
1290    Please stop sending in non canines like this Guatemalan Twiggle Bunny. We only rate dogs. Only send in dogs... 11/10 https://t.co/XKhobeGuvT                      
1356    Really guys? Again? I know this is a rare Albanian Bingo Seal, but we only rate dogs. Only send in dogs... 9/10 https://t.co/6JYLpUmBrC                           
1362    This is an East African Chalupa Seal. We only rate dogs. Please only send in dogs. Thank you... 10/10 https://t.co/iHe6liLwWR                                     
1385    We only rate dogs. Pls stop sending in non-canines like this Mongolian grass snake. This is very frustrating. 11/10 https://t.co/22x9SbCYCU                       
1435    Please stop sending in saber-toothed tigers. This is getting ridiculous. We only rate dogs.\n...8/10 https://t.co/iAeQNueou8                                      
1527    Stop sending in lobsters. This is the final warning. We only rate dogs. Thank you... 9/10 https://t.co/B9ZXXKJYNx                                                 
1564    Please send dogs. I'm tired of seeing other stuff like this dangerous pirate. We only rate dogs. Thank you... 10/10 https://t.co/YdLytdZOqv                       
1610    For the last time, WE. DO. NOT. RATE. BULBASAUR. We only rate dogs. Please only send dogs. Thank you ...9/10 https://t.co/GboDG8WhJG                              
1693    This is actually a lion. We only rate dogs. For the last time please only send dogs. Thank u.\n12/10 would still pet https://t.co/Pp26dMQxap                      
1699    I thought I made this very clear. We only rate dogs. Stop sending other things like this shark. Thank you... 9/10 https://t.co/CXSJZ4Stk3                         
1726    Meet Sammy. At first I was like "that's a snowflake. we only rate dogs," but he would've melted by now, so 10/10 https://t.co/MQfPK4zwuh                          
1737    Guys this really needs to stop. We've been over this way too many times. This is a giraffe. We only rate dogs.. 7/10 https://t.co/yavgkHYPOC                      
1745    I know everyone's excited for Christmas but that doesn't mean you can send in reindeer. We only rate dogs... 8/10 https://t.co/eWjWgbOCYL                         
1854    Seriously guys?! Only send in dogs. I only rate dogs. This is a baby black bear... 11/10 https://t.co/H7kpabTfLj                                                  
1877    C'mon guys. We've been over this. We only rate dogs. This is a cow. Please only submit dogs. Thank you...... 9/10 https://t.co/WjcELNEqN2                         
1938    Guys I'm getting real tired of this. We only rate dogs. Please don't send in other things like this Bulbasaur. 3/10 https://t.co/t5rQHl6W8M                       
Name: text, Length: 64, dtype: object
In [28]:
# look at five randomly chosen values of the expanded_urls column
twitter_archive['expanded_urls'].sample(5)
Out[28]:
48      https://twitter.com/dog_rates/status/882992080364220416/photo/1,https://twitter.com/dog_rates/status/882992080364220416/photo/1
1288    https://twitter.com/dog_rates/status/708349470027751425/photo/1                                                                
1774    NaN                                                                                                                            
2274    https://twitter.com/dog_rates/status/667455448082227200/photo/1                                                                
1206    https://vine.co/v/hYdLVKDpAFu                                                                                                  
Name: expanded_urls, dtype: object
  • The account's admin uses this sentence to point out that the picture doesn't contain a dog!
In [29]:
# count how many tweets contain the "only rate dogs" sentence
only_rate_dogs = twitter_archive.text.str.match('.*only rate dogs')
len(twitter_archive.text[only_rate_dogs])
Out[29]:
64
In [30]:
# summary of image_pred: column names, non-null counts and dtypes
image_pred.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2075 entries, 0 to 2074
Data columns (total 12 columns):
tweet_id    2075 non-null int64
jpg_url     2075 non-null object
img_num     2075 non-null int64
p1          2075 non-null object
p1_conf     2075 non-null float64
p1_dog      2075 non-null bool
p2          2075 non-null object
p2_conf     2075 non-null float64
p2_dog      2075 non-null bool
p3          2075 non-null object
p3_conf     2075 non-null float64
p3_dog      2075 non-null bool
dtypes: bool(3), float64(3), int64(2), object(4)
memory usage: 152.1+ KB
In [31]:
# descriptive statistics of the numeric columns of image_pred
image_pred.describe()
Out[31]:
tweet_id img_num p1_conf p2_conf p3_conf
count 2.075000e+03 2075.000000 2075.000000 2.075000e+03 2.075000e+03
mean 7.384514e+17 1.203855 0.594548 1.345886e-01 6.032417e-02
std 6.785203e+16 0.561875 0.271174 1.006657e-01 5.090593e-02
min 6.660209e+17 1.000000 0.044333 1.011300e-08 1.740170e-10
25% 6.764835e+17 1.000000 0.364412 5.388625e-02 1.622240e-02
50% 7.119988e+17 1.000000 0.588230 1.181810e-01 4.944380e-02
75% 7.932034e+17 1.000000 0.843855 1.955655e-01 9.180755e-02
max 8.924206e+17 4.000000 1.000000 4.880140e-01 2.734190e-01
In [32]:
# five randomly sampled rows of image_pred
image_pred.sample(5)
Out[32]:
tweet_id jpg_url img_num p1 p1_conf p1_dog p2 p2_conf p2_dog p3 p3_conf p3_dog
533 676897532954456065 https://pbs.twimg.com/media/CWTSt0UW4AALMNB.jpg 1 hamster 0.628255 False guinea_pig 0.318646 False macaque 0.013058 False
662 682638830361513985 https://pbs.twimg.com/media/CXk4W0qWYAMEMEs.jpg 1 English_springer 0.440781 True Cardigan 0.411182 True Border_collie 0.022412 True
1578 796149749086875649 https://pbs.twimg.com/media/Cwx99rpW8AMk_Ie.jpg 1 golden_retriever 0.600276 True Labrador_retriever 0.140798 True seat_belt 0.087355 False
1442 775364825476165632 https://pbs.twimg.com/media/CsKmMB2WAAAXcAy.jpg 3 beagle 0.571229 True Chihuahua 0.175257 True Pembroke 0.034306 True
2034 883838122936631299 https://pbs.twimg.com/media/DEQGFgAXUAAEvfi.jpg 1 Doberman 0.610946 True miniature_pinscher 0.299603 True kelpie 0.063020 True
In [33]:
# frequency of each top-ranked (p1) prediction label;
# note that p1 is not limited to dog breeds (e.g. seat_belt, web_site appear too)
image_pred.p1.value_counts()
Out[33]:
golden_retriever             150
Labrador_retriever           100
Pembroke                     89 
Chihuahua                    83 
pug                          57 
chow                         44 
Samoyed                      43 
toy_poodle                   39 
Pomeranian                   38 
malamute                     30 
cocker_spaniel               30 
French_bulldog               26 
Chesapeake_Bay_retriever     23 
miniature_pinscher           23 
seat_belt                    22 
Staffordshire_bullterrier    20 
German_shepherd              20 
Siberian_husky               20 
Cardigan                     19 
web_site                     19 
beagle                       18 
Shetland_sheepdog            18 
Eskimo_dog                   18 
teddy                        18 
Maltese_dog                  18 
Rottweiler                   17 
Lakeland_terrier             17 
Shih-Tzu                     17 
kuvasz                       16 
Italian_greyhound            16 
                             .. 
minibus                      1  
washer                       1  
beaver                       1  
beach_wagon                  1  
sulphur-crested_cockatoo     1  
clog                         1  
fiddler_crab                 1  
scorpion                     1  
rain_barrel                  1  
hare                         1  
hay                          1  
tick                         1  
robin                        1  
fire_engine                  1  
radio_telescope              1  
dhole                        1  
timber_wolf                  1  
groenendael                  1  
bannister                    1  
loupe                        1  
otter                        1  
teapot                       1  
standard_schnauzer           1  
swab                         1  
starfish                     1  
suit                         1  
piggy_bank                   1  
conch                        1  
bonnet                       1  
water_bottle                 1  
Name: p1, Length: 378, dtype: int64
In [34]:
# summary of tweet_json: column names, non-null counts and dtypes
tweet_json.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2354 entries, 0 to 2353
Data columns (total 3 columns):
id                2354 non-null int64
favorite_count    2354 non-null int64
retweet_count     2354 non-null int64
dtypes: int64(3)
memory usage: 55.2 KB
In [35]:
# descriptive statistics of the numeric columns of tweet_json
tweet_json.describe()
Out[35]:
id favorite_count retweet_count
count 2.354000e+03 2354.000000 2354.000000
mean 7.426978e+17 8080.968564 3164.797366
std 6.852812e+16 11814.771334 5284.770364
min 6.660209e+17 0.000000 0.000000
25% 6.783975e+17 1415.000000 624.500000
50% 7.194596e+17 3603.500000 1473.500000
75% 7.993058e+17 10122.250000 3652.000000
max 8.924206e+17 132810.000000 79515.000000
In [36]:
# five randomly sampled rows of tweet_json
tweet_json.sample(5)
Out[36]:
id favorite_count retweet_count
109 871102520638267392 21461 5764
1007 747512671126323200 6110 1803
2093 670764103623966721 1154 466
2030 671763349865160704 1788 999
806 771908950375665664 7298 2181
In [37]:
# compare the number of records in each dataset to judge completeness
tuple(frame.shape[0] for frame in (twitter_archive, tweet_json, image_pred))
Out[37]:
(2356, 2354, 2075)
In [38]:
# column names that appear in more than one of the three datasets
all_columns = pd.Series(
    [*twitter_archive.columns, *tweet_json.columns, *image_pred.columns]
)
all_columns[all_columns.duplicated()]
Out[38]:
20    tweet_id
dtype: object

Notes

Quality

  • twitter_archive

    • Missing Values :

      • in_reply_to_status_id, in_reply_to_user_id : 78 instead of 2356
      • retweeted_status_id,retweeted_status_user_id,retweeted_status_timestamp 181 instead of 2356
      • expanded_urls : 2297 instead of 2356
    • We are interested in the tweet ONLY not the retweet
    • We are interested in the tweet ONLY not the reply to the original tweet

    • tweet_id is saved as int datatype instead of/ "better to be" string (object)

    • timestamp , retweeted_status_timestamp are saved as object datatype (str) instead of date/timestamp
    • source column is written in HTML containing <a> tags
    • column name :

      • some values are not title-cased (untitled_unlowers): ('BeBe','DonDon','CeCe', 'JD', 'DayZ')
      • some are inaccurate values (lowers): ['such', 'a', 'quite', 'not', 'one', 'incredibly', 'mad','an', 'very', 'just', 'my', 'his', 'actually', 'getting','this', 'unacceptable', 'all', 'old', 'infuriating', 'the','by', 'officially', 'life', 'light', 'space']
    • rating_numerator & rating_denominator:

      • datatype for rating_numerator should be float instead of int
      • fix:
        • @45 13.5/10 instead of 5/10
        • @ 313 13/10 instead of 960/0
        • @ 2335 : 9/10 instead of 1/2
        • @ 1068 : 14/10 instead of 9/11
        • @1165: 13/10 instead of 4/20
        • @ 1202 : 11/10 instead of 50/50
        • @ 1662 10/10 instead of 7/11
        • @ 695 : 9.75/10 instead of 75/10
        • @763 : 11.27/10 instead of 27/10
        • @1712 :11.26/10 instead of 26/10
      • drop:
        • @ 516 no rating
        • @342 inaccurate (account start date)
      • investigate (outliers):
    • columns doggo,floofer,pupper, puppo has None values instead of Null.
    • We are interested in dogs , text column reveals the truth about that some tweets are not related to dogs
    • expanded_urls is too bulky we are interested in tweet link only.

  • image_pred
    • some images are not for dogs
    • tweet_id is saved as int datatype instead of object datatype
    • replace the underscore in breeds values with space and title all breeds values (p1 &p2& p3)

  • twitter_json

    • column id is saved as int datatype instead of object datatype & rename as tweet_id

  • All_datasets

    • we have completeness issue not all the datasets have the same number of observation

Tidiness

  • twitter_archive

    • text column has two variables text and short urls,create short_urls column, drop expanded_urls
    • The values of four columns (doggo,floofer,pupper,puppo) in twitter_archive dataset should be in one column dog_stage with a category datatype.
    • rating_numerator and rating_denominator columns in twitter_archive dataset should form one column dog_rating normalized out of 10.
    • make new columns for day_name and month from the timestamp column

  • image_pred

    • Columns p1, p1_dog, p1_conf , p2, p2_dog, p2_conf , p3, p3_dog, p3_conf could be condensed to two columns dog_breed and confidence

  • All datasets

    • tweet_id is present in two datasets and after renaming it will appear in all datasets
    • tweet_json and image_pred datasets should be part of our main dataset twitter_archive.

Clean

In [39]:
# work on copies so the gathered datasets stay untouched
twitter_archive_clean, image_pred_clean, tweet_json_clean = (
    frame.copy() for frame in (twitter_archive, image_pred, tweet_json)
)

First things first

  • Let's start with the missing values

1 Missing Values

twitter_archive

  • Missing Values :

    • in_reply_to_status_id, in_reply_to_user_id : 78 instead of 2356
    • retweeted_status_id,retweeted_status_user_id,retweeted_status_timestamp 181 instead of 2356
    • expanded_urls : 2297 instead of 2356 (to be fixed later) #### Define
      • in the twitter_archive dataset we will keep only records that:
      • 1 Are not associated with retweets.
      • 2 Are not associated with reply to the original tweet.
      • i.e we will keep the NaN values for these columns and drop non NaN values
      • Drop columns:
    • in_reply_to_status_id
    • in_reply_to_user_id
    • retweeted_status_id
    • retweeted_status_user_id
    • retweeted_status_timestamp

Code

In [40]:
# keep only original tweets: rows in which every reply/retweet id column is missing.
# Missing values are tested with isna(); comparing a float column with the string
# "NaN" inside query() is not a documented way to detect NaN.
id_columns = ['in_reply_to_status_id', 'in_reply_to_user_id',
              'retweeted_status_id', 'retweeted_status_user_id']
is_original_tweet = twitter_archive_clean[id_columns].isna().all(axis=1)
twitter_archive_clean = twitter_archive_clean[is_original_tweet]
# drop the reply/retweet columns, which are no longer needed
xcolumns = ['in_reply_to_status_id','in_reply_to_user_id','retweeted_status_id',
           'retweeted_status_user_id', 'retweeted_status_timestamp']
twitter_archive_clean = twitter_archive_clean.drop(columns=xcolumns)

Test

In [41]:
# check the remaining columns and their non-null counts in twitter_archive_clean
twitter_archive_clean.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 2097 entries, 0 to 2355
Data columns (total 12 columns):
tweet_id              2097 non-null int64
timestamp             2097 non-null object
source                2097 non-null object
text                  2097 non-null object
expanded_urls         2094 non-null object
rating_numerator      2097 non-null int64
rating_denominator    2097 non-null int64
name                  2097 non-null object
doggo                 2097 non-null object
floofer               2097 non-null object
pupper                2097 non-null object
puppo                 2097 non-null object
dtypes: int64(3), object(9)
memory usage: 213.0+ KB

Tidiness

1 twitter_archive

  • text column has two variables text and short urls,create short_urls column, drop expanded_urls

Define

  • use split method by ' ' over the text column, and apply over row
  • create short_urls column
  • drop expanded_urls column
  • split the text column by https: and assign its value to the same column name

Code

In [42]:
# create short_urls from the link that ends the tweet text; tweets that do not end
# with a link get NaN instead of having their last word stored as a "url"
twitter_archive_clean['short_urls'] = twitter_archive_clean['text'].str.extract(
    r'(https?://\S+)\s*$', expand=False)
# drop the expanded_urls
twitter_archive_clean = twitter_archive_clean.drop(columns='expanded_urls')
# keep only the part of the text that precedes the first `https:` link
twitter_archive_clean['text'] = twitter_archive_clean['text'].apply(
    lambda tweet: tweet.split('https:')[0])

Test

In [43]:
twitter_archive_clean.sample()
Out[43]:
tweet_id timestamp source text rating_numerator rating_denominator name doggo floofer pupper puppo short_urls
1108 734559631394082816 2016-05-23 01:40:38 +0000 <a href="http://vine.co" rel="nofollow">Vine - Make a Scene</a> When your friend is turnt af and you're just trying to chill. 10/10 (vid by @presgang) 10 10 None None None None None https://t.co/OufVDk23JC
In [44]:
# the expanded_urls column must no longer be present after the drop
assert 'expanded_urls' not in twitter_archive_clean.columns

Tidiness

2 twitter_archive

  • The values of four columns (doggo,floofer,pupper,puppo) in twitter_archive dataset should be in one column dog_stage with a category datatype.

Define

  • select the last 4 columns related to the different dog stages
  • replace the 'None' string with np.nan in the selected columns
  • create a dog_stage column joining all the values in the selected columns, dropping NaN, converted to str
  • convert the dog_stage column type to categorical
  • drop the columns related to the previous 4 stages

Code

In [45]:
# the four one-per-stage columns that get folded into a single dog_stage column
all_dogs_type = ['doggo', 'floofer', 'pupper', 'puppo']
# the archive marks "stage not mentioned" with the string 'None'; turn it into NaN
twitter_archive_clean[all_dogs_type] = twitter_archive_clean[all_dogs_type].replace('None', np.nan)

# per row, join whatever stages are present (more than one stage gives e.g. 'doggo, pupper')
joined_stages = twitter_archive_clean[all_dogs_type].apply(
    lambda row: ', '.join(row.dropna().astype(str)), axis=1)
# rows without any stage yield '', which becomes NaN; store the result as a category
twitter_archive_clean['dog_stage'] = joined_stages.replace('', np.nan).astype('category')

# the four source columns are now redundant
twitter_archive_clean = twitter_archive_clean.drop(columns=all_dogs_type)

Test

In [46]:
# check the columns and dtypes after building dog_stage and dropping the stage columns
twitter_archive_clean.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 2097 entries, 0 to 2355
Data columns (total 9 columns):
tweet_id              2097 non-null int64
timestamp             2097 non-null object
source                2097 non-null object
text                  2097 non-null object
rating_numerator      2097 non-null int64
rating_denominator    2097 non-null int64
name                  2097 non-null object
short_urls            2097 non-null object
dog_stage             336 non-null category
dtypes: category(1), int64(3), object(5)
memory usage: 149.9+ KB
In [47]:
# frequency of each value (including combined stages) in the new dog_stage column
twitter_archive_clean.dog_stage.value_counts()
Out[47]:
pupper            221
doggo             72 
puppo             23 
floofer           9  
doggo, pupper     9  
doggo, puppo      1  
doggo, floofer    1  
Name: dog_stage, dtype: int64

Quality

2 rating_numerator & rating_denominator:

  • datatype for rating_numerator should be float instead of int

Define

  • convert the datatype of rating_numerator to float by astype('float')

Code

In [48]:
# store rating_numerator as float so fractional ratings can be recorded
twitter_archive_clean['rating_numerator'] = twitter_archive_clean['rating_numerator'].astype(float)

Test

In [49]:
# confirm that rating_numerator is now reported with a float dtype
twitter_archive_clean.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 2097 entries, 0 to 2355
Data columns (total 9 columns):
tweet_id              2097 non-null int64
timestamp             2097 non-null object
source                2097 non-null object
text                  2097 non-null object
rating_numerator      2097 non-null float64
rating_denominator    2097 non-null int64
name                  2097 non-null object
short_urls            2097 non-null object
dog_stage             336 non-null category
dtypes: category(1), float64(1), int64(2), object(5)
memory usage: 149.9+ KB

Quality

3 rating_numerator & rating_denominator:

  • fix:
    • @45 13.5/10 instead of 5/10
    • @ 313 13/10 instead of 960/0
    • @ 2335 : 9/10 instead of 1/2
    • @ 1068 : 14/10 instead of 9/11
    • @1165: 13/10 instead of 4/20
    • @ 1202 : 11/10 instead of 50/50
    • @ 1662 10/10 instead of 7/11
    • @ 695 : 9.75/10 instead of 75/10
    • @763 : 11.27/10 instead of 27/10
    • @1712 :11.26/10 instead of 26/10

4 rating_numerator & rating_denominator:

  • drop:
    • @ 516 no rating
    • @342 inaccurate (account start date) retweets & replies are already dropped

Define

  • check for the above indices if exist
  • get a list of the indices of the errors after check
  • set a list for the correct values relative to those indices
  • loop through the two lists and assign each index with the new correct value
  • drop [516] by index

Code

In [50]:
# report which of the flagged row labels are still present in the cleaned archive
indices = [45,313,2335,1068,1165,1202,1662,695,763,1712,516,342]
surviving_labels = set(twitter_archive_clean.index)
for i in indices:
    print('yes' if i in surviving_labels else f'No : {i} ')
yes
No : 313 
yes
yes
yes
yes
yes
yes
yes
yes
yes
No : 342 
In [51]:
# row labels with a wrongly parsed rating that are still present
indices = [45,2335,1068,1165,1202,1662,695,763,1712]
# the correct numerators, in the same order as `indices`
vals = [13.5,9,14,13,11,10,9.75,11.27,11.26]
# write all corrected numerators in one go and set every matching denominator to 10
twitter_archive_clean.loc[indices, 'rating_numerator'] = vals
twitter_archive_clean.loc[indices, 'rating_denominator'] = 10
# drop the index: 516
twitter_archive_clean = twitter_archive_clean.drop(index=516)

Test

In [52]:
# one of the corrected entries must carry its new numerator
assert twitter_archive_clean.at[1712, 'rating_numerator'] == 11.26
# the row labelled 516 must be gone
assert 516 not in twitter_archive_clean.index
In [53]:
# re-inspect the frame after the rating fixes and the drop of index 516
twitter_archive_clean.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 2096 entries, 0 to 2355
Data columns (total 9 columns):
tweet_id              2096 non-null int64
timestamp             2096 non-null object
source                2096 non-null object
text                  2096 non-null object
rating_numerator      2096 non-null float64
rating_denominator    2096 non-null int64
name                  2096 non-null object
short_urls            2096 non-null object
dog_stage             336 non-null category
dtypes: category(1), float64(1), int64(2), object(5)
memory usage: 229.8+ KB
In [54]:
# frequency of each rating_denominator value after the manual fixes
twitter_archive_clean.rating_denominator.value_counts()
Out[54]:
10     2085
80     2   
50     2   
170    1   
150    1   
120    1   
110    1   
90     1   
70     1   
40     1   
Name: rating_denominator, dtype: int64

Tidiness

3 twitter_archive

  • rating_numerator and rating_denominator columns in twitter_archive dataset should form one column dog_rating normalized out of 10.

Define

  • divide the rating_numerator / rating_denominator and then multiply by 10
  • make dog_score column
  • drop the columns rating_numerator & rating_denominator column

Code

In [55]:
# dog_score is the rating rescaled to "out of 10": 10 * numerator / denominator
# (multiply first, then divide, to keep the same floating-point results)
scaled_numerator = twitter_archive_clean['rating_numerator'].mul(10)
twitter_archive_clean['dog_score'] = scaled_numerator.div(twitter_archive_clean['rating_denominator'])
# the two source columns are no longer needed
twitter_archive_clean = twitter_archive_clean.drop(columns=['rating_numerator','rating_denominator'])

Test

In [56]:
# frequency of each dog_score value; unusually large scores stand out here
twitter_archive_clean.dog_score.value_counts()
Out[56]:
12.00      490
10.00      438
11.00      419
13.00      288
9.00       154
8.00       98 
7.00       51 
14.00      39 
5.00       33 
6.00       32 
3.00       19 
4.00       15 
2.00       9  
1.00       4  
9.75       1  
0.00       1  
11.26      1  
11.27      1  
13.50      1  
420.00     1  
1776.00    1  
Name: dog_score, dtype: int64
In [57]:
# check the columns and dtypes after replacing the two rating columns with dog_score
twitter_archive_clean.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 2096 entries, 0 to 2355
Data columns (total 8 columns):
tweet_id      2096 non-null int64
timestamp     2096 non-null object
source        2096 non-null object
text          2096 non-null object
name          2096 non-null object
short_urls    2096 non-null object
dog_stage     336 non-null category
dog_score     2096 non-null float64
dtypes: category(1), float64(1), int64(1), object(5)
memory usage: 213.4+ KB

Quality

5 fix the tweet_id columns in all datasets

Define

  • rename the id column in twitter_json to tweet_id
  • change the datatype to str(object) for tweet_id column in all datasets

Code

In [58]:
# rename the id column in twitter_json to tweet_id; renaming by name (instead of
# overwriting the whole column list) does not depend on column order and is a
# no-op if the cell is run again
tweet_json_clean = tweet_json_clean.rename(columns={'id': 'tweet_id'})

# change the datatype to str in all datasets; astype('object') would only box the
# integers in an object column without turning them into strings
datasets = [twitter_archive_clean,image_pred_clean,tweet_json_clean]
for i in datasets:
    i['tweet_id'] = i['tweet_id'].astype(str)

Test

In [59]:
# tweet_id must be stored with the object dtype in every dataset
assert all(frame.tweet_id.dtypes == 'object' for frame in datasets)

Tidiness

4 image_pred dataset condense the columns p1, p1_dog, p1_conf, ...etc to dog_breed, confidence

  • we are interested in images of dogs only
  • we are going to select those that have at least one dog prediction among the top three predictions

Define

  • define a dog_breed_confidence function to extract the dog_breed and confidence from the top 3 predictions
  • apply the function row wise
  • assign the new column names dog_breed and confidence
  • drop the un needed columns now

Quality issues now:

Define

  • rename the No breed values with np.nan
  • replace the underscore with space and title all breeds values

Code

In [60]:
# define the function
def dog_breed_confidence(data):
    """Pick the highest-ranked prediction that is a dog for one image row.

    Parameters
    ----------
    data : one row of image_pred
        Any object exposing p1, p1_conf, p1_dog, ..., p3, p3_conf, p3_dog as
        attributes (a row Series or a namedtuple from itertuples).

    Returns
    -------
    tuple
        (breed, confidence) of the first prediction among p1, p2, p3 whose
        *_dog flag is true, or ('No breed', 0) when none of them is a dog.
    """
    for rank in ('p1', 'p2', 'p3'):
        if getattr(data, rank + '_dog'):
            return getattr(data, rank), getattr(data, rank + '_conf')
    return 'No breed', 0

# evaluate the function row wise. The function returns its result instead of
# appending to module-level lists: DataFrame.apply may call a function more than
# once, so collecting results through side effects is fragile.
predictions = [dog_breed_confidence(row)
               for row in image_pred_clean.itertuples(index=False)]
breed = [name for name, _ in predictions]
confidence = [conf for _, conf in predictions]
# assign the new column names
image_pred_clean['dog_breed'] = breed
image_pred_clean['confidence'] = confidence
# drop the un needed columns now
image_pred_clean.drop(columns = ['p1', 'p1_dog', 'p1_conf' , 'p2', 'p2_dog',
                                 'p2_conf' , 'p3', 'p3_dog', 'p3_conf'], inplace =True)
# rename the No breed values with np.nan (only in the dog_breed column)
image_pred_clean['dog_breed'] = image_pred_clean['dog_breed'].replace('No breed', np.nan)
# replace the underscore with space and title all breeds values
image_pred_clean['dog_breed'] = image_pred_clean['dog_breed'].str.replace('_',' ').str.title()

Test

In [61]:
# first five rows of image_pred_clean, showing the new dog_breed and confidence columns
image_pred_clean.head()
Out[61]:
tweet_id jpg_url img_num dog_breed confidence
0 666020888022790149 https://pbs.twimg.com/media/CT4udn0WwAA0aMy.jpg 1 Welsh Springer Spaniel 0.465074
1 666029285002620928 https://pbs.twimg.com/media/CT42GRgUYAA5iDo.jpg 1 Redbone 0.506826
2 666033412701032449 https://pbs.twimg.com/media/CT4521TWwAEvMyu.jpg 1 German Shepherd 0.596461
3 666044226329800704 https://pbs.twimg.com/media/CT5Dr8HUEAA-lEu.jpg 1 Rhodesian Ridgeback 0.408143
4 666049248165822465 https://pbs.twimg.com/media/CT5IQmsXIAAKY4A.jpg 1 Miniature Pinscher 0.560311

Tidiness

5 tweet_json and image_pred datasets should be part of our main dataset twitter_archive.

  • we are interested in the retweet_count and favorite_count from tweet_json and keeping the original data
  • we are interested in the tweets that have images

Define

  • use the merge function to merge twitter_archive_clean and tweet_json_clean on tweet_id column (left join)
  • use the merge function to merge twitter_archive_clean and image_pred_clean on tweet_id column (inner join)
  • make the master dataset

Code

In [62]:
# attach favorite_count / retweet_count to every archive tweet (left join on tweet_id)
twitter_archive_clean = twitter_archive_clean.merge(tweet_json_clean, on='tweet_id', how='left')

# keep only the tweets that also have an image prediction (inner join on tweet_id);
# the result is the master dataset
master_dataset = twitter_archive_clean.merge(image_pred_clean, on='tweet_id', how='inner')

Test

In [63]:
# check the columns, non-null counts and dtypes of the merged master dataset
master_dataset.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 1970 entries, 0 to 1969
Data columns (total 14 columns):
tweet_id          1970 non-null object
timestamp         1970 non-null object
source            1970 non-null object
text              1970 non-null object
name              1970 non-null object
short_urls        1970 non-null object
dog_stage         303 non-null category
dog_score         1970 non-null float64
favorite_count    1970 non-null int64
retweet_count     1970 non-null int64
jpg_url           1970 non-null object
img_num           1970 non-null int64
dog_breed         1665 non-null object
confidence        1970 non-null float64
dtypes: category(1), float64(2), int64(3), object(8)
memory usage: 217.8+ KB
In [64]:
# number of records without an image url; 0 means every record has an image
master_dataset.jpg_url.isnull().sum()
Out[64]:
0

Quality

6 source column is written in HTML containing <a> tags

Define

  • check for the unique values in source columns to know how to extract the needed value
  • make a function fix_source which extract the strings between tags (>text<)
  • use apply function to fix the source column row wise

Code

In [65]:
# list the distinct raw values of the source column to see how they are structured
master_dataset.source.unique()
Out[65]:
array(['<a href="http://twitter.com/download/iphone" rel="nofollow">Twitter for iPhone</a>',
       '<a href="http://twitter.com" rel="nofollow">Twitter Web Client</a>',
       '<a href="https://about.twitter.com/products/tweetdeck" rel="nofollow">TweetDeck</a>'],
      dtype=object)
In [66]:
# make a function fix_source which extracts the string between tags
def fix_source(i):
    """Return the text enclosed by the first HTML tag pair of a source value.

    Parameters
    ----------
    i : str
        A value of the source column, e.g.
        '<a href="http://twitter.com" rel="nofollow">Twitter Web Client</a>'.

    Returns
    -------
    str
        The text between the first '>' and the next '<'. A string without any
        '>' is returned unchanged, so applying the function to an already
        cleaned value does not cut off its last character.
    """
    _, closing_bracket, after_tag = i.partition('>')
    if not closing_bracket:
        # no tag at all: nothing to strip
        return i
    # everything up to the next opening bracket is the visible text
    return after_tag.partition('<')[0]
In [67]:
# run fix_source over every value of the source column and store the result back
master_dataset['source'] = master_dataset['source'].apply(fix_source)

Test

In [68]:
# check for the result values in the source column
master_dataset.source.value_counts()
Out[68]:
Twitter for iPhone    1931
Twitter Web Client    28  
TweetDeck             11  
Name: source, dtype: int64
  • The Vine records were lost when merging the datasets because they have no jpg_url: they are video links that were not passed to the model.

Quality

7 timestamp is saved as object datatype (str) instead of date/timestamp

Define

  • change the datatype of timestamp column to datetime

Code

In [69]:
# parse the timestamp strings into pandas datetimes
master_dataset['timestamp'] = pd.to_datetime(master_dataset['timestamp'])

Test

In [70]:
# check for the datatype
master_dataset.timestamp.dtype
Out[70]:
datetime64[ns, UTC]

Tidiness

6 make new columns for day_name and month for more analysis

Define

  • extract the month name from the timestamp column
  • extract the day name from the timestamp column

Code

In [71]:
# extract the month name with the vectorised .dt accessor
# (timestamp is already datetime, so no per-row lambda is needed)
master_dataset['month'] = master_dataset.timestamp.dt.month_name()
# extract the day name the same way
master_dataset['day_name'] = master_dataset.timestamp.dt.day_name()

Test

In [72]:
# show timestamp, day_name and month for index labels 0 through 5
# (.loc slices are label-based and include the end label, so this is 6 rows)
master_dataset.loc[:5,['timestamp','day_name','month']]
Out[72]:
timestamp day_name month
0 2017-08-01 16:23:56+00:00 Tuesday August
1 2017-08-01 00:17:27+00:00 Tuesday August
2 2017-07-31 00:18:03+00:00 Monday July
3 2017-07-30 15:58:51+00:00 Sunday July
4 2017-07-29 16:00:24+00:00 Saturday July
5 2017-07-29 00:08:17+00:00 Saturday July
In [73]:
# check the datatypes
master_dataset.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 1970 entries, 0 to 1969
Data columns (total 16 columns):
tweet_id          1970 non-null object
timestamp         1970 non-null datetime64[ns, UTC]
source            1970 non-null object
text              1970 non-null object
name              1970 non-null object
short_urls        1970 non-null object
dog_stage         303 non-null category
dog_score         1970 non-null float64
favorite_count    1970 non-null int64
retweet_count     1970 non-null int64
jpg_url           1970 non-null object
img_num           1970 non-null int64
dog_breed         1665 non-null object
confidence        1970 non-null float64
month             1970 non-null object
day_name          1970 non-null object
dtypes: category(1), datetime64[ns, UTC](1), float64(2), int64(3), object(9)
memory usage: 328.5+ KB

Quality

7 column name :

  • rename to dog_name
  • some values are not title-cased (untitled_unlowers), e.g. 'BeBe', 'DonDon', 'CeCe', 'JD', 'DayZ'
  • some values are inaccurate (lowers): ['such', 'a', 'quite', 'not', 'one', 'incredibly', 'mad','an', 'very', 'just', 'my', 'his', 'actually', 'getting','this', 'unacceptable', 'all', 'old', 'infuriating', 'the','by', 'officially', 'life', 'light', 'space']

Define

  • rename the column to dog_name
  • convert lowercase names to np.nan
  • make all values title-cased
  • replace 'None' values with np.nan values

Code

In [74]:
# rename the name column to dog_name
master_dataset.rename(columns={'name':'dog_name'},inplace=True)
# boolean mask of the all-lowercase "names" (words such as 'a', 'the', 'quite');
# mark them with the same 'None' placeholder used for missing names
lowers = master_dataset.dog_name.str.islower()
master_dataset.loc[lowers,'dog_name'] = 'None'
# title-case every value with the vectorised string method, then turn the 'None'
# placeholder into np.nan. The result is assigned back to the column: calling
# replace(..., inplace=True) on the `master_dataset.dog_name` accessor may operate
# on a temporary copy (e.g. under pandas Copy-on-Write) and leave the frame unchanged.
master_dataset['dog_name'] = master_dataset['dog_name'].str.title().replace('None', np.nan)

Test

In [75]:
# check for all is titled
master_dataset.dog_name.str.istitle().value_counts()
Out[75]:
True    1348
Name: dog_name, dtype: int64
In [76]:
# assert for our work
# NOTE: `assert [ ... ]` only checks that the list is non-empty, so the per-item
# results must be reduced explicitly (any()/all()) for the check to mean anything.
clean_names = master_dataset.dog_name.dropna()
# none of the original mixed-case spellings (e.g. 'BeBe', 'JD') survived the title-casing
assert not clean_names.isin(list(untitled_unlowers)).any()
# every remaining name is title-cased, so no all-lowercase word is left as a name
assert clean_names.str.istitle().all()
assert not clean_names.str.islower().any()
assert 'dog_name' in master_dataset.columns
In [77]:
# check for dog_name frequencies
master_dataset.dog_name.value_counts()
Out[77]:
Charlie      11
Lucy         10
Cooper       10
Oliver       10
Tucker       9 
Penny        9 
Sadie        8 
Winston      8 
Lola         7 
Daisy        7 
Toby         7 
Stanley      6 
Koda         6 
Jax          6 
Bo           6 
Bella        6 
Chester      5 
Scout        5 
Bailey       5 
Louis        5 
Oscar        5 
Leo          5 
Rusty        5 
Buddy        5 
Dave         5 
Milo         5 
Phil         4 
Gary         4 
George       4 
Jack         4 
            .. 
Amy          1 
Jordy        1 
Mosby        1 
Akumi        1 
Bookstore    1 
Kallie       1 
Olaf         1 
Sparky       1 
Stewie       1 
Bronte       1 
Humphrey     1 
Eriq         1 
Orion        1 
Eugene       1 
Pavlov       1 
Bayley       1 
Philbert     1 
Remus        1 
Kevon        1 
Alf          1 
Sweet        1 
Ike          1 
Enchilada    1 
Mojo         1 
Timber       1 
Kial         1 
Dallas       1 
Lulu         1 
Bloo         1 
Puff         1 
Name: dog_name, Length: 912, dtype: int64
In [78]:
# check for data names
master_dataset.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 1970 entries, 0 to 1969
Data columns (total 16 columns):
tweet_id          1970 non-null object
timestamp         1970 non-null datetime64[ns, UTC]
source            1970 non-null object
text              1970 non-null object
dog_name          1348 non-null object
short_urls        1970 non-null object
dog_stage         303 non-null category
dog_score         1970 non-null float64
favorite_count    1970 non-null int64
retweet_count     1970 non-null int64
jpg_url           1970 non-null object
img_num           1970 non-null int64
dog_breed         1665 non-null object
confidence        1970 non-null float64
month             1970 non-null object
day_name          1970 non-null object
dtypes: category(1), datetime64[ns, UTC](1), float64(2), int64(3), object(9)
memory usage: 248.5+ KB

Quality

8 We are interested in dogs, but the text column reveals that some tweets are not related to dogs

Define

  • search the text column for "only rate dogs", the phrase the account admin uses to point out that the photo is not a dog
  • confirm that those rows have no name in the dog_name column
  • drop the rows that contain this text using their indices

Code

In [79]:
# rows that have no dog name ...
has_no_name = master_dataset.dog_name.isnull()
# ... and whose text contains the admin's "only rate dogs" complaint
says_only_dogs = master_dataset.text.str.match('.*only rate dogs')
not_dogs = master_dataset.loc[has_no_name & says_only_dogs]
# number of such records
len(not_dogs)
Out[79]:
53
In [80]:
# explore data
not_dogs
Out[80]:
tweet_id timestamp source text dog_name short_urls dog_stage dog_score favorite_count retweet_count jpg_url img_num dog_breed confidence month day_name
24 887101392804085760 2017-07-18 00:07:08+00:00 Twitter for iPhone This... is a Jubilant Antarctic House Bear. We only rate dogs. Please only send dogs. Thank you... 12/10 would suffocate in floof NaN https://t.co/4Ad1jzJSdp NaN 12.0 31061 6167 https://pbs.twimg.com/media/DE-eAq6UwAA-jaE.jpg 1 Samoyed 0.733942 July Tuesday
52 880872448815771648 2017-06-30 19:35:32+00:00 Twitter for iPhone Ugh not again. We only rate dogs. Please don't send in well-dressed floppy-tongued street penguins. Dogs only please. Thank you... 12/10 NaN https://t.co/WiAMbTkDPf NaN 12.0 21734 3989 https://pbs.twimg.com/media/DDl8zzJW0AAisCJ.jpg 1 Pembroke 0.791416 June Friday
77 874057562936811520 2017-06-12 00:15:36+00:00 Twitter for iPhone I can't believe this keeps happening. This, is a birb taking a bath. We only rate dogs. Please only send dogs. Thank you... 12/10 NaN https://t.co/pwY9PQhtP2 NaN 12.0 23134 4125 https://pbs.twimg.com/media/DCFGtdoXkAEsqIw.jpg 1 Flat-Coated Retriever 0.832177 June Monday
103 867900495410671616 2017-05-26 00:29:37+00:00 Twitter for iPhone Unbelievable. We only rate dogs. Please don't send in non-canines like the "I" from Pixar's opening credits. Thank you... 12/10 NaN https://t.co/JMhDNv5wXZ NaN 12.0 24964 4439 https://pbs.twimg.com/media/DAtm5MkXoAA4R6P.jpg 1 Labrador Retriever 0.522644 May Friday
106 867051520902168576 2017-05-23 16:16:06+00:00 Twitter for iPhone Oh my this spooked me up. We only rate dogs, not happy ghosts. Please send dogs only. It's a very simple premise. Thank you... 13/10 NaN https://t.co/M5Rz0R8SIQ NaN 13.0 33420 8425 https://pbs.twimg.com/media/DAhiwb0XcAA8x5Q.jpg 1 Samoyed 0.471403 May Tuesday
113 864873206498414592 2017-05-17 16:00:15+00:00 Twitter for iPhone We only rate dogs. Please don't send in Jesus. We're trying to remain professional and legitimate. Thank you... 14/10 NaN https://t.co/wr3xsjeCIR NaN 14.0 33651 9361 https://pbs.twimg.com/media/DAClmHkXcAA1kSv.jpg 2 NaN 0.000000 May Wednesday
123 862096992088072192 2017-05-10 00:08:34+00:00 Twitter for iPhone We only rate dogs. Please don't send perfectly toasted marshmallows attempting to drive. Thank you... 13/10 NaN https://t.co/nvZyyrp0kd NaN 13.0 66437 21840 https://pbs.twimg.com/media/C_bIo7QXYAAGfPu.jpg 2 Chow 0.677589 May Wednesday
134 859196978902773760 2017-05-02 00:04:57+00:00 Twitter for iPhone We only rate dogs. This is quite clearly a smol broken polar bear. We'd appreciate if you only send dogs. Thank you... 12/10 NaN https://t.co/g2nSyGenG9 NaN 12.0 75193 25661 https://pbs.twimg.com/ext_tw_video_thumb/859196962498805762/pu/img/-yBpr4-o4GJZECYE.jpg 1 Malamute 0.216163 May Tuesday
147 855459453768019968 2017-04-21 16:33:22+00:00 Twitter for iPhone Guys, we only rate dogs. This is quite clearly a bulbasaur. Please only send dogs. Thank you... 12/10 human used pet, it's super effective NaN https://t.co/Xc7uj1C64x NaN 12.0 31657 8987 https://pbs.twimg.com/media/C98z1ZAXsAEIFFn.jpg 2 Blenheim Spaniel 0.389513 April Friday
169 849776966551130114 2017-04-06 00:13:11+00:00 Twitter for iPhone Seriously guys? Again? We only rate dogs. Please stop submitting other things like this super good hammerhead shark. Thank you... 12/10 NaN https://t.co/TCMC90mSOT NaN 12.0 32390 8404 https://pbs.twimg.com/media/C8sDpDWWsAE5P08.jpg 2 Chihuahua 0.292092 April Thursday
173 848690551926992896 2017-04-03 00:16:10+00:00 Twitter for iPhone Please stop sending in animals other than dogs. We only rate dogs. Not Furry Ecuadorian Sea Turtles. Thank you... 12/10 NaN https://t.co/UOE79zb6VU NaN 12.0 27104 4826 https://pbs.twimg.com/media/C8cnjHuXsAAoZQf.jpg 1 Flat-Coated Retriever 0.823648 April Monday
187 845677943972139009 2017-03-25 16:45:08+00:00 Twitter for iPhone C'mon guys. Please only send in dogs. We only rate dogs, not Exceptional-Tongued Peruvian Floor Bears. Thank you... 12/10 NaN https://t.co/z30iQLiXNo NaN 12.0 27154 5365 https://pbs.twimg.com/media/C7xzmngWkAAAp9C.jpg 1 Chow 0.808681 March Saturday
231 835172783151792128 2017-02-24 17:01:22+00:00 Twitter for iPhone We only rate dogs. Please don't send in any non-canines like this Floppy Tongued House Panda. Thank you... 12/10 would still pet NaN https://t.co/8fX2VkExnL NaN 12.0 28552 6516 https://pbs.twimg.com/media/C5chM_jWAAQmov9.jpg 2 Border Collie 0.663138 February Friday
289 826240494070030336 2017-01-31 01:27:39+00:00 Twitter for iPhone We only rate dogs. Please don't send in any more non-dogs like this Wild Albanian Street Moose. Thank you... 11/10 NaN https://t.co/srXL2s868C NaN 11.0 14614 2965 https://pbs.twimg.com/media/C3dlVMbXAAUd-Gh.jpg 1 French Bulldog 0.903048 January Tuesday
308 822610361945911296 2017-01-21 01:02:48+00:00 Twitter for iPhone Please stop sending in non-canines like this Very Pettable Dozing Bath Tortoise. We only rate dogs. Only send dogs... 12/10 NaN https://t.co/mcagPeENIh NaN 12.0 16327 3423 https://pbs.twimg.com/media/C2p_wQyXEAELtvS.jpg 1 Cocker Spaniel 0.664487 January Saturday
311 822244816520155136 2017-01-20 00:50:15+00:00 Twitter for iPhone We only rate dogs. Please don't send pics of men capturing low level clouds. Thank you... 11/10 NaN https://t.co/rLi83ZyCL5 NaN 11.0 38832 11421 https://pbs.twimg.com/media/C2kzTGxWEAEOpPL.jpg 1 Samoyed 0.585441 January Friday
363 813187593374461952 2016-12-26 01:00:05+00:00 Twitter for iPhone We only rate dogs. Please don't send in other things like this very good Christmas tree. Thank you... 13/10 NaN https://t.co/rvSANEsQZJ NaN 13.0 22085 5096 https://pbs.twimg.com/media/C0kFzOQUoAAt6yb.jpg 1 Golden Retriever 0.888181 December Monday
386 809920764300447744 2016-12-17 00:38:52+00:00 Twitter for iPhone Please only send in dogs. We only rate dogs, not seemingly heartbroken ewoks. Thank you... still 10/10 would console NaN https://t.co/HIraYS1Bzo NaN 10.0 17250 4521 https://pbs.twimg.com/media/Cz1qo05XUAQ4qXp.jpg 1 Norwich Terrier 0.397163 December Saturday
400 806219024703037440 2016-12-06 19:29:28+00:00 Twitter for iPhone We only rate dogs. Please stop sending in non-canines like this Freudian Poof Lion. This is incredibly frustrating... 11/10 NaN https://t.co/IZidSrBvhi NaN 11.0 7145 1388 https://pbs.twimg.com/media/CzBD7MWVIAA5ptx.jpg 1 Chow 0.835102 December Tuesday
526 781524693396357120 2016-09-29 16:03:01+00:00 Twitter for iPhone Idk why this keeps happening. We only rate dogs. Not Bangladeshi Couch Chipmunks. Please only send dogs... 12/10 NaN https://t.co/ya7bviQUUf NaN 12.0 23163 6426 https://pbs.twimg.com/media/CtiIj0AWcAEBDvw.jpg 1 Chesapeake Bay Retriever 0.003523 September Thursday
535 780192070812196864 2016-09-25 23:47:39+00:00 Twitter for iPhone We only rate dogs. Pls stop sending in non-canines like this Urban Floof Giraffe. I can't handle this. 11/10 NaN https://t.co/zHIqpM5Gni NaN 11.0 9712 2589 https://pbs.twimg.com/media/CtPMhwvXYAIt6NG.jpg 1 Vizsla 0.144012 September Sunday
577 772581559778025472 2016-09-04 23:46:12+00:00 Twitter for iPhone Guys this is getting so out of hand. We only rate dogs. This is a Galapagos Speed Panda. Pls only send dogs... 10/10 NaN https://t.co/8lpAGaZRFn NaN 10.0 7192 1968 https://pbs.twimg.com/media/CrjC0JAWAAAjz6n.jpg 3 Newfoundland 0.574345 September Sunday
591 770655142660169732 2016-08-30 16:11:18+00:00 Twitter for iPhone We only rate dogs. Pls stop sending in non-canines like this Arctic Floof Kangaroo. This is very frustrating. 11/10 NaN https://t.co/qlUDuPoE3d NaN 11.0 8130 2013 https://pbs.twimg.com/media/CrHqwjWXgAAgJSe.jpg 1 NaN 0.000000 August Tuesday
631 761745352076779520 2016-08-06 02:06:59+00:00 Twitter for iPhone Guys.. we only rate dogs. Pls don't send any more pics of the Loch Ness Monster. Only send in dogs. Thank you. 11/10 NaN https://t.co/obH5vMbm1j NaN 11.0 4707 979 https://pbs.twimg.com/media/CpJDWqhW8AAFt45.jpg 1 NaN 0.000000 August Saturday
645 759923798737051648 2016-08-01 01:28:46+00:00 Twitter for iPhone We only rate dogs... this is a Taiwanese Guide Walrus. Im getting real heckin tired of this. Please send dogs. 10/10 NaN https://t.co/49hkNAsubi NaN 10.0 16284 6521 https://pbs.twimg.com/media/CovKqSYVIAAUbUW.jpg 1 Labrador Retriever 0.324579 August Monday
751 746872823977771008 2016-06-26 01:08:52+00:00 Twitter for iPhone This is a carrot. We only rate dogs. Please only send in dogs. You all really should know this by now ...11/10 NaN https://t.co/9e48aPrBm2 NaN 11.0 6593 2429 https://pbs.twimg.com/media/Cl1s1p7WMAA44Vk.jpg 1 Pembroke 0.540201 June Sunday
761 745422732645535745 2016-06-22 01:06:43+00:00 Twitter for iPhone We only rate dogs. Pls stop sending in non-canines like this Jamaican Flop Seal. This is very very frustrating. 9/10 NaN https://t.co/nc53zEN0hZ NaN 9.0 9412 2768 https://pbs.twimg.com/media/ClhGBCAWIAAFCsz.jpg 1 Labrador Retriever 0.663800 June Wednesday
796 740214038584557568 2016-06-07 16:09:13+00:00 Twitter for iPhone This is getting incredibly frustrating. This is a Mexican Golden Beaver. We only rate dogs. Only send dogs ...10/10 NaN https://t.co/0yolOOyD3X NaN 10.0 7335 2220 https://pbs.twimg.com/media/CkXEu2OUoAAs8yU.jpg 1 Chesapeake Bay Retriever 0.586414 June Tuesday
801 739544079319588864 2016-06-05 19:47:03+00:00 Twitter for iPhone This... is a Tyrannosaurus rex. We only rate dogs. Please only send in dogs. Thank you ...10/10 NaN https://t.co/zxw8d5g94P NaN 10.0 43694 24319 https://pbs.twimg.com/media/CkNjahBXAAQ2kWo.jpg 1 Labrador Retriever 0.967397 June Sunday
808 738184450748633089 2016-06-02 01:44:22+00:00 Twitter for iPhone For the last time, we only rate dogs. Pls stop sending other animals like this Duck-Billed Platypus. Thank you. 9/10 NaN https://t.co/twxYcPOafl NaN 9.0 4727 1360 https://pbs.twimg.com/media/Cj6O1G9UYAAIU-1.jpg 1 Bedlington Terrier 0.289471 June Thursday
813 737678689543020544 2016-05-31 16:14:39+00:00 Twitter for iPhone We only rate dogs. Pls stop sending in non-canines like this Slovak Car Bunny. It makes my job very difficult. 11/10 NaN https://t.co/VflvQLH2y5 NaN 11.0 5528 1509 https://pbs.twimg.com/media/CjzC2oGWYAAyIfG.jpg 1 Pembroke 0.935307 May Tuesday
819 736225175608430592 2016-05-27 15:58:54+00:00 Twitter for iPhone We only rate dogs. Please stop sending in non-canines like this Alaskan Flop Turtle. This is very frustrating. 10/10 NaN https://t.co/qXteK6Atxc NaN 10.0 8901 3115 https://pbs.twimg.com/media/CjeY5DKXEAA3WkD.jpg 1 Labrador Retriever 0.399217 May Friday
824 735274964362878976 2016-05-25 01:03:06+00:00 Twitter for iPhone We only rate dogs. Please stop sending in your 31 year old sons that won't get out of your house. Thank you... 11/10 NaN https://t.co/aTU53NNUkt NaN 11.0 9629 4707 https://pbs.twimg.com/media/CjQ4radW0AENP-m.jpg 1 NaN 0.000000 May Wednesday
842 730924654643314689 2016-05-13 00:56:32+00:00 Twitter for iPhone We only rate dogs. Pls stop sending non-canines like this Bulgarian Eyeless Porch Bear. This is unacceptable... 9/10 NaN https://t.co/2yctWAUZ3Z NaN 9.0 6682 2234 https://pbs.twimg.com/media/CiTEFjDXAAAqU6I.jpg 1 Newfoundland 0.086241 May Friday
869 726224900189511680 2016-04-30 01:41:23+00:00 Twitter for iPhone I'm getting super heckin frustrated with you all sending in non canines like this ostrich. We only rate dogs... 9/10 NaN https://t.co/Rgbni2Ns8z NaN 9.0 4811 1302 https://pbs.twimg.com/media/ChQRsYaW0AETD7z.jpg 1 Standard Poodle 0.261112 April Saturday
911 717537687239008257 2016-04-06 02:21:30+00:00 Twitter for iPhone People please. This is a Deadly Mediterranean Plop T-Rex. We only rate dogs. Only send in dogs. Thanks you... 11/10 NaN https://t.co/2ATDsgHD4n NaN 11.0 6281 2069 https://pbs.twimg.com/media/CfU0t75W4AAUo9V.jpg 1 Golden Retriever 0.779356 April Wednesday
921 715733265223708672 2016-04-01 02:51:22+00:00 Twitter for iPhone This is a taco. We only rate dogs. Please only send in dogs. Dogs are what we rate. Not tacos. Thank you... 10/10 NaN https://t.co/cxl6xGY8B9 NaN 10.0 5093 1920 https://pbs.twimg.com/media/Ce7LlUeUUAEQkQl.jpg 1 Dandie Dinmont 0.740229 April Friday
936 714141408463036416 2016-03-27 17:25:54+00:00 Twitter for iPhone I know we only rate dogs, but since it's Easter I guess we could rate a bunny for a change. 10/10 petable as hell NaN https://t.co/O2RlKXigHu NaN 10.0 4673 1569 https://pbs.twimg.com/media/Cekj0qwXEAAHcS6.jpg 1 Labrador Retriever 0.586951 March Sunday
944 712717840512598017 2016-03-23 19:09:09+00:00 Twitter for iPhone Please don't send in any more polar bears. We only rate dogs. Thank you... 10/10 NaN https://t.co/83RGhdIQz2 NaN 10.0 13474 5616 https://pbs.twimg.com/media/CeQVF1eVIAAJaTv.jpg 1 Great Pyrenees 0.732043 March Wednesday
993 708130923141795840 2016-03-11 03:22:23+00:00 Twitter for iPhone Please stop sending in non canines like this Guatemalan Twiggle Bunny. We only rate dogs. Only send in dogs... 11/10 NaN https://t.co/XKhobeGuvT NaN 11.0 3707 943 https://pbs.twimg.com/media/CdPJUWIWIAAIchl.jpg 1 French Bulldog 0.710354 March Friday
1054 703041949650034688 2016-02-26 02:20:37+00:00 Twitter for iPhone This is an East African Chalupa Seal. We only rate dogs. Please only send in dogs. Thank you... 10/10 NaN https://t.co/iHe6liLwWR NaN 10.0 28996 14198 https://pbs.twimg.com/media/CcG07BYW0AErrC9.jpg 1 NaN 0.000000 February Friday
1074 700747788515020802 2016-02-19 18:24:26+00:00 Twitter for iPhone We only rate dogs. Pls stop sending in non-canines like this Mongolian grass snake. This is very frustrating. 11/10 NaN https://t.co/22x9SbCYCU NaN 11.0 25130 10673 https://pbs.twimg.com/media/CbmOY41UAAQylmA.jpg 1 Great Pyrenees 0.481333 February Friday
1121 697259378236399616 2016-02-10 03:22:44+00:00 Twitter for iPhone Please stop sending in saber-toothed tigers. This is getting ridiculous. We only rate dogs.\n...8/10 NaN https://t.co/iAeQNueou8 NaN 8.0 3611 1136 https://pbs.twimg.com/media/Ca0ps3AXEAAnp9m.jpg 1 Great Dane 0.999223 February Wednesday
1195 690360449368465409 2016-01-22 02:28:52+00:00 Twitter for iPhone Stop sending in lobsters. This is the final warning. We only rate dogs. Thank you... 9/10 NaN https://t.co/B9ZXXKJYNx NaN 9.0 2925 1006 https://pbs.twimg.com/media/CZSnKw8WwAAAN7q.jpg 1 Pug 0.686933 January Friday
1229 688116655151435777 2016-01-15 21:52:49+00:00 Twitter for iPhone Please send dogs. I'm tired of seeing other stuff like this dangerous pirate. We only rate dogs. Thank you... 10/10 NaN https://t.co/YdLytdZOqv NaN 10.0 3093 888 https://pbs.twimg.com/media/CYyucekVAAESj8K.jpg 1 Pug 0.973819 January Friday
1267 685532292383666176 2016-01-08 18:43:29+00:00 Twitter for iPhone For the last time, WE. DO. NOT. RATE. BULBASAUR. We only rate dogs. Please only send dogs. Thank you ...9/10 NaN https://t.co/GboDG8WhJG NaN 9.0 3336 1298 https://pbs.twimg.com/media/CYN_-6iW8AQhPu2.jpg 1 Collie 0.095805 January Friday
1340 681297372102656000 2015-12-28 02:15:26+00:00 Twitter for iPhone This is actually a lion. We only rate dogs. For the last time please only send dogs. Thank u.\n12/10 would still pet NaN https://t.co/Pp26dMQxap NaN 12.0 3490 1091 https://pbs.twimg.com/media/CXR0WJ_W8AMd_O8.jpg 1 Lhasa 0.482401 December Monday
1346 680970795137544192 2015-12-27 04:37:44+00:00 Twitter for iPhone I thought I made this very clear. We only rate dogs. Stop sending other things like this shark. Thank you... 9/10 NaN https://t.co/CXSJZ4Stk3 NaN 9.0 2665 749 https://pbs.twimg.com/media/CXNLU6wWkAE0OkJ.jpg 1 Pug 0.713102 December Sunday
1382 679530280114372609 2015-12-23 05:13:38+00:00 Twitter for iPhone Guys this really needs to stop. We've been over this way too many times. This is a giraffe. We only rate dogs.. 7/10 NaN https://t.co/yavgkHYPOC NaN 7.0 5208 2347 https://pbs.twimg.com/media/CW4tL1vWcAIw1dw.jpg 1 Dalmatian 0.750256 December Wednesday
1389 679148763231985668 2015-12-22 03:57:37+00:00 Twitter for iPhone I know everyone's excited for Christmas but that doesn't mean you can send in reindeer. We only rate dogs... 8/10 NaN https://t.co/eWjWgbOCYL NaN 8.0 3028 1163 https://pbs.twimg.com/media/CWzSMmAWsAAyB1u.jpg 1 Italian Greyhound 0.302685 December Tuesday
1486 675534494439489536 2015-12-12 04:35:48+00:00 Twitter for iPhone Seriously guys?! Only send in dogs. I only rate dogs. This is a baby black bear... 11/10 NaN https://t.co/H7kpabTfLj NaN 11.0 1953 470 https://pbs.twimg.com/media/CV_7CV6XIAEV05u.jpg 1 Chow 0.749368 December Saturday
1508 675109292475830276 2015-12-11 00:26:12+00:00 Twitter for iPhone C'mon guys. We've been over this. We only rate dogs. This is a cow. Please only submit dogs. Thank you...... 9/10 NaN https://t.co/WjcELNEqN2 NaN 9.0 3006 1259 https://pbs.twimg.com/media/CV54UQTXAAAGf-j.jpg 1 Dalmatian 0.989519 December Friday
1562 673906403526995968 2015-12-07 16:46:21+00:00 Twitter for iPhone Guys I'm getting real tired of this. We only rate dogs. Please don't send in other things like this Bulbasaur. 3/10 NaN https://t.co/t5rQHl6W8M NaN 3.0 3406 1799 https://pbs.twimg.com/media/CVoySqoWUAAWb7N.jpg 1 Soft-Coated Wheaten Terrier 0.048928 December Monday
In [81]:
# remove every row collected in not_dogs, addressing the rows by their index labels
master_dataset.drop(index=not_dogs.index, inplace=True)

Test

In [82]:
# check for only rate dogs if still exists
len(master_dataset.loc[master_dataset.dog_name.isnull()& master_dataset.text.str.match('.*only rate dogs')])
Out[82]:
0
In [83]:
# check the new shape
master_dataset.shape
Out[83]:
(1917, 16)
In [84]:
# list any remaining tweets that still mention "only rate dogs", with their dog_name
master_dataset.loc[master_dataset.text.str.match('.*only rate dogs'),['text','dog_name']]
Out[84]:
text dog_name
1372 Meet Sammy. At first I was like "that's a snowflake. we only rate dogs," but he would've melted by now, so 10/10 Sammy
In [85]:
# Success! The text of the remaining tweet above confirms that it is about a real dog.

Check Outliers

In [86]:
# show the rows whose dog_score is above 14
master_dataset[master_dataset.dog_score >14]
Out[86]:
tweet_id timestamp source text dog_name short_urls dog_stage dog_score favorite_count retweet_count jpg_url img_num dog_breed confidence month day_name
721 749981277374128128 2016-07-04 15:00:45+00:00 TweetDeck This is Atticus. He's quite simply America af. 1776/10 Atticus https://t.co/GRXwMxLBkh NaN 1776.0 5569 2772 https://pbs.twimg.com/media/CmgBZ7kWcAAlzFD.jpg 1 NaN 0.0 July Monday
1695 670842764863651840 2015-11-29 05:52:33+00:00 Twitter for iPhone After so many requests... here you go.\n\nGood dogg. 420/10 NaN https://t.co/yfAAo1gdeY NaN 420.0 7989 4324 https://pbs.twimg.com/media/CU9P717W4AAOlKx.jpg 1 NaN 0.0 November Sunday
  • It seems that we have funny outliers here: Atticus gets the highest score in celebration of Independence Day, so the score is related to the occasion and his outfit
  • The second one is also a joke, as this is not a picture of a real dog; it refers to the singer whose nickname is Snoop Dogg

Define

  • So we are going to drop these outliers

Code

In [87]:
# collect the index labels of the rows scored above 14 and remove those rows
outliers = master_dataset.index[master_dataset.dog_score > 14].tolist()
master_dataset.drop(index=outliers, inplace=True)

Test

In [88]:
# check for the master data shape
master_dataset.shape
Out[88]:
(1915, 16)
In [89]:
master_dataset[master_dataset.dog_score>14]
Out[89]:
tweet_id timestamp source text dog_name short_urls dog_stage dog_score favorite_count retweet_count jpg_url img_num dog_breed confidence month day_name

Final Check to the Tidy Master Dataset

In [90]:
#check for the final master
master_dataset.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 1915 entries, 0 to 1969
Data columns (total 16 columns):
tweet_id          1915 non-null object
timestamp         1915 non-null datetime64[ns, UTC]
source            1915 non-null object
text              1915 non-null object
dog_name          1347 non-null object
short_urls        1915 non-null object
dog_stage         303 non-null category
dog_score         1915 non-null float64
favorite_count    1915 non-null int64
retweet_count     1915 non-null int64
jpg_url           1915 non-null object
img_num           1915 non-null int64
dog_breed         1617 non-null object
confidence        1915 non-null float64
month             1915 non-null object
day_name          1915 non-null object
dtypes: category(1), datetime64[ns, UTC](1), float64(2), int64(3), object(9)
memory usage: 241.6+ KB

Store Data

In [91]:
# Store the data after combining and cleaning (index=False: the row index is not written)
master_dataset.to_csv('twitter_archive_master.csv',encoding='utf-8',index=False)

Visualization & Analysis

1 What's the most source used by followers to share their dog's nice photo ?

In [92]:
# horizontal bars of tweets per source, smallest count first
fig, ax = plt.subplots()
source_counts = master_dataset.source.value_counts().sort_values()
source_counts.plot(kind='barh', ax=ax)
ax.set_title("Distribution of Tweets'Source")
ax.set_xlabel('Total Tweets')
ax.set_ylabel('Source');
In [93]:
# share of each source as a fraction of all tweets (counts divided by their total)
master_dataset.source.value_counts() / master_dataset.source.value_counts().sum()
Out[93]:
Twitter for iPhone    0.980157
Twitter Web Client    0.014621
TweetDeck             0.005222
Name: source, dtype: float64

It is clear from the above that the Twitter for iPhone app has the largest share (98%), which may be explained by:

  • The ease of taking a shot of a dog from within the app
  • The high resolution of the cameras.
  • It is worth mentioning that around 91 records from the Vine source were omitted in the cleaning process because they contain no jpg_url; they account for only around 5%, so the above insight is still valid.

2 Which is the most popular day/month to post a dog photo?

In [94]:
# tweets per weekday; `kind` must be passed by keyword because
# Series.plot() rejects positional arguments on pandas >= 1.0
master_dataset.day_name.value_counts().plot(kind='bar')
plt.title("Distribution of Tweets over Days")
plt.xlabel('Week Days')
plt.ylabel('Frequency');
In [95]:
# tweets per month; `kind` must be passed by keyword because
# Series.plot() rejects positional arguments on pandas >= 1.0
master_dataset.month.value_counts().plot(kind='bar')
plt.title("Distribution of Tweets over Months")
plt.xlabel('Month')
plt.ylabel('Frequency');
  • It is quite clear that people tend to post their dogs' photos on Mondays and in December
    • Interestingly, the top day is Monday, which may indicate that most followers are not under work stress (maybe they are not workers)
    • The top month is December, which may be interpreted as the effect of the Christmas and New Year vacations, when people tend to go out with their dogs and take shots
    • These interpretations need more data and investigation to be confirmed
In [96]:
# encode the day and month of each tweet as the integer day*100 + month (e.g. 01/07 -> 107),
# using the vectorised .dt accessor instead of a per-row lambda
day_month = master_dataset.timestamp.dt.day * 100 + master_dataset.timestamp.dt.month
# 15 most frequent codes; .iloc makes the positional slice explicit on an integer index,
# and `kind` is passed by keyword because Series.plot() rejects positional arguments on pandas >= 1.0
day_month.value_counts().sort_values(ascending=False).iloc[:15].plot(kind='bar')
plt.title("Distribution of Tweets over Day/Month")
plt.xlabel('Day/Month "ddmm"')
plt.xticks(rotation = 90)
plt.ylabel('Frequency');
  • Voilà! It's quite clear now: the most common day/month values in our sample dataset indicate that most posts fall at the end of November, which matches the Thanksgiving holiday, as it falls on the fourth Thursday of November and is followed by Black Friday, which is also a holiday

3 Which is the most common dog name?

In [97]:
# rank the name frequencies in descending order and keep the first 10;
# `kind` must be passed by keyword because Series.plot() rejects positional arguments on pandas >= 1.0
master_dataset.dog_name.value_counts().sort_values(ascending =False).iloc[:10].plot(kind='barh')
plt.title("Most Common Dogs' Names")
plt.xlabel('Frequency')
plt.ylabel("Dog's Name");
  • It is obvious here that the most used dog name in our sample is Charlie, followed by Lucy, Cooper and Oliver (10 each in the frequency table above).
    • It may be that people tend to use human names for their dogs

4 What is the most common dog stage?

In [98]:
# frequency of each dog stage; `kind` must be passed by keyword because
# Series.plot() rejects positional arguments on pandas >= 1.0
master_dataset.dog_stage.value_counts().plot(kind='bar')
plt.title("Distribution of Dog Stages")
plt.xlabel('Dog Stage')
plt.ylabel('Frequency');
  • As per the Dogtionary, a pupper is a small doggo; here pupper is the most common dog stage in our dataset
  • This may be because dogs at this stage/age are preferred by owners.

Caveats: We need to note that many values for the dog stage are missing, maybe because the stage is not known by the account itself and/or the dog's owner

5 How do @WeRateDogs account rate dogs?

In [99]:
# histogram of the dog scores on an explicit axes, with the mean marked as a dashed line
fig, ax = plt.subplots()
scores = master_dataset.dog_score
scores.hist(bins=15, ax=ax)
ax.axvline(x=scores.mean(), color='orange', linestyle='--', label='Mean')
ax.set_title('Distribution of Dog scores')
ax.set_xlabel('Scores')
ax.set_ylabel('Count')
ax.set_xticks(np.arange(15))
ax.legend(loc=0);
In [100]:
# descriptive stats
master_dataset.dog_score.describe()
Out[100]:
count    1915.000000
mean     10.547666  
std      2.175686   
min      0.000000   
25%      10.000000  
50%      11.000000  
75%      12.000000  
max      14.000000  
Name: dog_score, dtype: float64
  • We can notice from the above plot that the most frequent score is around 12, and the maximum is 14
  • Although the rating system of the account is out of 10, the average rating is actually 10.55!
  • One of the main causes of this account's popularity is that they tend to give higher scores, i.e. above 10. Brent was right!

6 Which is the most common breed?

  • Note: I need to iterate over the cleaning process here to fix the breed names (remove underscores and title-case the breed names)
In [101]:
# 10 most frequent dog breeds; `kind` must be passed by keyword because
# Series.plot() rejects positional arguments on pandas >= 1.0
master_dataset.dog_breed.value_counts().iloc[:10].plot(kind='bar')
plt.title('Distribution of Dog Breeds')
plt.xlabel('Breeds')
plt.ylabel('Count');
  • Here, The most common breed in our sample is Golden Retriever

    Caveats:

    • The breeds data contains a lot of null values
    • Also to take into consideration that this data is given by a neural network model

7 What is the account performance over time?

  • Here I need to answer the question of whether this Twitter account is gaining popularity over time or whether followers' interest is declining
  • So, I used a 30-day rolling average to get more insight into the account's performance
In [102]:
# A time-based rolling window needs a sorted DatetimeIndex, so order the tweets
# chronologically and index the two engagement columns by timestamp.
engagement = (master_dataset
              .sort_values('timestamp')
              .set_index('timestamp')[['favorite_count', 'retweet_count']])
# '30D' is a window of 30 calendar days; rolling(window=30) would instead average
# 30 consecutive rows (tweets), which is not what the title and labels state.
# 30-day rolling average for favorite count
y1 = engagement.favorite_count.rolling('30D').mean()
# 30-day rolling average for retweet count
y2 = engagement.retweet_count.rolling('30D').mean()
x = engagement.index
# label both lines so that the legend below has entries to show
plt.plot(x, y1, label='Favorites')
plt.plot(x, y2, label='Retweets')
plt.xticks(rotation = 90)
plt.title('30 days Rolling Average Account Performance')
plt.xlabel('Dates')
plt.ylabel('30 days average')
plt.legend(loc=0);

It's quite clear here that the @WeRateDogs account is getting more popular over time, as shown by the increasing number of likes (favorite_count)

  • It is also quite clear that followers tend to like more than to retweet

8 Who is the top retweeted and/or favorite dog?

In [103]:
def get_photo(param):
    """
    Save the photo of the tweet with the largest value of param and print that value.

    INPUT: param as one of our dataset columns (e.g. 'favorite_count')
    ----------------------------------------------
    OUTPUT:
            image downloaded from the jpg_url link and saved as
            images/top_of_<param>.jpg
            print out the top value of param
    """
    # local import keeps this cell runnable on its own
    from pathlib import Path

    # idxmax returns the index label of the largest value without a full sort
    winner = master_dataset.loc[master_dataset[param].idxmax()]

    r = requests.get(winner['jpg_url'], timeout=30)
    # fail loudly on an HTTP error instead of handing an error page to PIL
    r.raise_for_status()

    # portable path (the old '.\images/' literal relied on an invalid '\i' escape)
    images_dir = Path('images')
    images_dir.mkdir(exist_ok=True)
    Image.open(BytesIO(r.content)).save(images_dir / f'top_of_{param}.jpg')
    print(f'Top {param} is: {winner[param]}')
    
In [104]:
# save the photo of the tweet with the highest favorite_count and print that count
get_photo('favorite_count')
Top favorite_count is: 132810
In [105]:
# save the photo of the tweet with the highest retweet_count and print that count
get_photo('retweet_count')
Top retweet_count is: 79515

Top Favorite count winner with 132,810 likes

Top Retweet count winner with 79,515 retweets

In [106]:
# final winner
# Pick the tweet with the highest combined engagement (retweets + favorites).
# Ranking is by magnitude; counting how often a (retweet, favorite) pair occurs
# (value_counts) would not identify the largest values.
from pathlib import Path

engagement = master_dataset['retweet_count'] + master_dataset['favorite_count']
# double brackets keep `winner` a one-row DataFrame, so .item() works below
winner = master_dataset.loc[[engagement.idxmax()]]

r = requests.get(winner['jpg_url'].item(), timeout=30)
# fail loudly on an HTTP error instead of handing an error page to PIL
r.raise_for_status()

# portable path (the old '.\images\winner.jpg' literal relied on invalid escapes)
images_dir = Path('images')
images_dir.mkdir(exist_ok=True)
Image.open(BytesIO(r.content)).save(images_dir / 'winner.jpg')
# print the final result
print(f"No of retweets is : {winner['retweet_count'].item()}, \nNo of favorite_count is {winner['favorite_count'].item()}")
No of retweets is : 79515, 
No of favorite_count is 131075
In [107]:
# dog_score of the winning tweet; `winner` is the one-row selection made in the previous cell
winner['dog_score']
Out[107]:
768    13.0
Name: dog_score, dtype: float64

This awesome dog proved his ability to swim in a pool, so he caught the attention of followers, who either retweeted him or gave him a like

9 How does the @WeRateDogs account write its posts? (DogCloud)

In [108]:
# Build one long string from all tweet texts for the word cloud.
# Joining the raw strings avoids Series.to_string(), which is a display
# formatter: it pads values to a common width and may shorten long ones.
# Slashes are stripped as before (e.g. '13/10' becomes '1310').
text = ' '.join(master_dataset.text.dropna().astype(str)).replace('/', '').strip()
In [109]:
def wordzcloud(text):
    """
    Build a word cloud from text using a dog picture as mask, save it and return it.

    INPUT: text as a single string holding all the words to draw
    ----------------------------------------------
    OUTPUT:
            word cloud saved as images/dog_cloud.jpg
            returns the result of wc.to_image() so the notebook displays it
    """
    # local import keeps this cell runnable on its own
    from pathlib import Path

    # download the dog picture used as the mask
    url = 'https://cdn.pixabay.com/photo/2013/11/28/11/32/dog-220324_960_720.jpg'
    r = requests.get(url, timeout=30)
    # fail loudly on an HTTP error instead of handing an error page to PIL
    r.raise_for_status()
    mask = np.array(Image.open(BytesIO(r.content)))

    # custom stopwords (the library default STOPWORDS set is not used here)
    stopwords = {'This', 'and', 'is', 'the', 'to'}
    # NOTE(review): no contour_width is passed, so contour_color may have no
    # visible effect; confirm against the wordcloud defaults
    wc = WordCloud(background_color='white',
                   mask=mask,
                   stopwords=stopwords,
                   max_words=100,
                   contour_color='blue')
    # generate the word cloud
    wc.generate(text)

    # save the image; portable path (the old '.\images\dog_cloud.jpg' literal
    # relied on invalid '\i' and '\d' escapes)
    images_dir = Path('images')
    images_dir.mkdir(exist_ok=True)
    wc.to_file(str(images_dir / 'dog_cloud.jpg'))
    return wc.to_image()
In [110]:
# generate the word cloud from text; the returned image is displayed as the cell output
wordzcloud(text)
Out[110]:

Interesting Notes Here:

  • The admin uses friendly words such as Meet, Say Hello, Here, love, like...
  • "He" appears more often than "She", which suggests that more of the dogs are male
  • The dog stage "pupper" appears much more often than "doggo", which matches my analysis.


A work by Mohamed Hindam

hindamosh@gmail.com