# import the needed libraries
import pandas as pd
import numpy as np
import requests
import zipfile
import matplotlib.pyplot as plt
from pandas.plotting import register_matplotlib_converters
register_matplotlib_converters()
import seaborn as sns
plt.style.use('bmh')
from PIL import Image
from io import BytesIO
from wordcloud import WordCloud, STOPWORDS
%matplotlib inline
# read the provided twitter-archive-enhanced.csv file (file on hand) into a DataFrame
twitter_archive = pd.read_csv('twitter-archive-enhanced.csv')
# read the first 5 rows for data inspection
twitter_archive.head()
# getting the image prediction file programmatically using the given url
url = 'https://d17h27t6h515a5.cloudfront.net/topher/2017/August/599fd2ad_image-predictions/image-predictions.tsv'
response = requests.get(url)
# save to .tsv file
with open('image_predictions.tsv', 'wb') as file:
    file.write(response.content)
# read the image prediction file and save to pandas DataFrame
image_pred = pd.read_csv('image_predictions.tsv',sep='\t')
# check for the data top 5 rows
image_pred.head()
I tried to set up a Twitter developer account, but my application was not approved.
# Query Twitter API for each tweet in the Twitter archive and save JSON in a text file
# These are hidden to comply with Twitter's API terms and conditions
#consumer_key = 'HIDDEN'
#consumer_secret = 'HIDDEN'
#access_token = 'HIDDEN'
#access_secret = 'HIDDEN'
#auth = OAuthHandler(consumer_key, consumer_secret)
#auth.set_access_token(access_token, access_secret)
#api = tweepy.API(auth, wait_on_rate_limit=True)
# NOTE TO REVIEWER: this student had mobile verification issues so the following
# Twitter API code was sent to this student from a Udacity instructor
# Tweet IDs for which to gather additional data via Twitter's API
#tweet_ids = twitter_archive.tweet_id.values
#len(tweet_ids)
# Query Twitter's API for JSON data for each tweet ID in the Twitter archive
#count = 0
#fails_dict = {}
#start = timer()
# Save each tweet's returned JSON as a new line in a .txt file
#with open('tweet_json.txt', 'w') as outfile:
# This loop will likely take 20-30 minutes to run because of Twitter's rate limit
# for tweet_id in tweet_ids:
# count += 1
# print(str(count) + ": " + str(tweet_id))
# try:
# tweet = api.get_status(tweet_id, tweet_mode='extended')
# print("Success")
# json.dump(tweet._json, outfile)
# outfile.write('\n')
# except tweepy.TweepError as e:
# print("Fail")
# fails_dict[tweet_id] = e
# pass
#end = timer()
#print(end - start)
#print(fails_dict)
The data that would have been gathered by the previous code is provided by Udacity in the project resources as a zip file.
# extract the file from the zipfile
with open('tweet-json.zip', 'rb') as f:
    z_tweets = zipfile.ZipFile(f)
    z_tweets.extractall()
# check for the extracted file
z_tweets.namelist()
# read the file in DataFrame
with open('tweet-json copy', 'r') as f:
    tweet_json = pd.read_json(f, lines=True, encoding='utf-8')
# check the data
tweet_json.head(3)
# check for the columns names
tweet_json.columns
# select the columns of interest : 'id', 'favorite_count','retweet_count'
tweet_json = tweet_json.loc[:,['id','favorite_count','retweet_count']]
# check for the top 5 rows
tweet_json.head()
Now we have our three datasets: twitter_archive, image_pred, and tweet_json.
# display twitter archive
twitter_archive
# display image_pred
image_pred
# display tweet_json
tweet_json
# twitter_archive data info
twitter_archive.info()
# statistical description of the twitter archive
twitter_archive.describe()
# data sample
twitter_archive.sample(5)
# check for source column
twitter_archive.source.value_counts()
# check how the dog names are cased (title case or not)
twitter_archive.name.str.istitle().value_counts()
# check for those written as lowercase
lowers = twitter_archive.name.loc[twitter_archive.name.str.islower()].unique()
lowers
# check for the unique values of those non titled
untitled = twitter_archive.name.loc[twitter_archive.name.str.istitle() == False].unique()
untitled
# check for names that are neither title case nor lowercase
untitled_unlowers = [i for i in untitled if i not in lowers]
untitled_unlowers
Since this project is about the rating of dogs, let's focus on the rating columns, i.e. rating_numerator and rating_denominator.
# check for denominator values below 10
pd.set_option('display.max_colwidth',-1)
twitter_archive.loc[twitter_archive.rating_denominator <10 , ['text','rating_numerator','rating_denominator']]
# check for rating denominator values > 10
twitter_archive.loc[twitter_archive.rating_denominator >10 ,['text','rating_numerator','rating_denominator']]
# check for rating_numerator <10
twitter_archive.loc[twitter_archive.rating_numerator < 10,['text','rating_numerator','rating_denominator']]
# check for rating_numerator values > 14
twitter_archive.loc[twitter_archive.rating_numerator > 14,['text','rating_numerator','rating_denominator']]
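As a side check (my own addition, not part of the original workflow), here is a minimal sketch of how the ratings could be re-extracted from the tweet text with a regular expression; the pattern and the column names below are assumptions, not the archive's original extraction logic.
# hedged sketch: re-extract "numerator/denominator" ratings (including decimals) from the text
extracted = twitter_archive.text.str.extract(r'(\d+(?:\.\d+)?)/(\d+)', expand=True)
extracted.columns = ['num_from_text', 'den_from_text']
extracted = extracted.astype('float')
# rows where the stored numerator disagrees with (or is missing from) the text
mismatch = twitter_archive.rating_numerator.astype('float') != extracted.num_from_text
twitter_archive.loc[mismatch, ['text', 'rating_numerator', 'rating_denominator']].head()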
Important points here:
# check for the text
twitter_archive.text.sample(5).tolist()
Some tweets contain the phrase "We only rate dogs". Let's investigate this.
# check inside the text values for non-dog-related tweets
twitter_archive.text[twitter_archive.text.str.match('.*only rate dogs')]
# check the expanded urls column
twitter_archive.expanded_urls.sample(5)
# check how many times this issue occurs
len(twitter_archive.text[twitter_archive.text.str.match('.*only rate dogs')])
# image_pred data info
image_pred.info()
# statistical description of image_pred
image_pred.describe()
# data sample
image_pred.sample(5)
# frequency of each predicted breed (p1)
image_pred.p1.value_counts()
# tweet_json data info
tweet_json.info()
# tweet_json statistics
tweet_json.describe()
# data sample
tweet_json.sample(5)
# check for datasets shape and completeness
twitter_archive.shape[0], tweet_json.shape[0] , image_pred.shape[0]
# duplicate columns in the three datasets
all_columns = pd.Series(list(twitter_archive ) + list(tweet_json) +list(image_pred))
all_columns[all_columns.duplicated()]
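Before merging later on, here is a quick sketch of my own to see how much the three datasets overlap on their tweet IDs (the variable names are illustrative; tweet_json still uses the id column at this point).
# hedged sketch: how many tweet IDs are shared across the three datasets?
archive_ids = set(twitter_archive.tweet_id)
json_ids = set(tweet_json.id)
image_ids = set(image_pred.tweet_id)
print('archive & tweet_json :', len(archive_ids & json_ids))
print('archive & image_pred :', len(archive_ids & image_ids))
print('common to all three  :', len(archive_ids & json_ids & image_ids))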
Quality issues

twitter_archive:
- Missing values: we are interested in the original tweet ONLY, not replies to or retweets of other tweets.
- tweet_id is saved as an int datatype instead of (better to be) a string (object).
- The source column contains HTML <a> tags.
- name column: mis-cased names (untitled_unlowers: 'BeBe', 'DonDon', 'CeCe', 'JD', 'DayZ') and lowercase non-names (lowers: 'such', 'a', 'quite', 'not', 'one', 'incredibly', 'mad', 'an', 'very', 'just', 'my', 'his', 'actually', 'getting', 'this', 'unacceptable', 'all', 'old', 'infuriating', 'the', 'by', 'officially', 'life', 'light', 'space').
- rating_numerator & rating_denominator: several ratings were extracted incorrectly from the text.

image_pred:
- Breed names contain underscores and inconsistent capitalization; some predictions are not dogs at all.

tweet_json:
- The id column does not match the tweet_id column name used in the other datasets.

All datasets:
- tweet_id should be stored as a string (object) in every dataset.

Tidiness issues

- The four dog stage columns (doggo, floofer, pupper, puppo) in the twitter_archive dataset should be in one column dog_stage with a category datatype.
- rating_numerator and rating_denominator in the twitter_archive dataset should form one column dog_rating normalized out of 10.
- day_name and month should be extracted from the timestamp column.
- In image_pred, the prediction columns (p1, p1_dog, p1_conf, ...) should be condensed into dog_breed and confidence.
- The tweet_json and image_pred datasets should be part of our main dataset twitter_archive.

# make a copy of the datasets
twitter_archive_clean = twitter_archive.copy()
image_pred_clean = image_pred.copy()
tweet_json_clean = tweet_json.copy()
First things first

1. Missing values: twitter_archive

From the twitter_archive dataset we will keep only records that:
1. Are not associated with retweets.
2. Are not associated with replies to the original tweet.

# keep only original tweets: no retweets and no replies
twitter_archive_clean = twitter_archive_clean[
    twitter_archive_clean.in_reply_to_status_id.isnull() &
    twitter_archive_clean.in_reply_to_user_id.isnull() &
    twitter_archive_clean.retweeted_status_id.isnull() &
    twitter_archive_clean.retweeted_status_user_id.isnull()]
# drop columns
xcolumns = ['in_reply_to_status_id','in_reply_to_user_id','retweeted_status_id',
'retweeted_status_user_id', 'retweeted_status_timestamp']
twitter_archive_clean = twitter_archive_clean.drop(columns = xcolumns, axis=1)
# check for Null values in the twitter_archive clean versions
twitter_archive_clean.info()
1. twitter_archive: expanded_urls column

- Create a short_urls column by splitting the text column on ' ' and taking the last token, applied row-wise.
- Drop the expanded_urls column.
- Split the text column on 'https:' and assign the first part back to the same column.

# create short_urls column by splitting the text column on ' ', applied row-wise
twitter_archive_clean['short_urls'] = twitter_archive_clean.text.apply(lambda x :x.strip().split(' ')[-1])
# drop the expanded_urls
twitter_archive_clean.drop('expanded_urls', axis =1, inplace=True)
# split the text column by `https:` and assign its value to the same column name
twitter_archive_clean.text = twitter_archive_clean.text.apply(lambda x:x.split('https:')[0])
twitter_archive_clean.sample()
# check for column dropping
assert 'expanded_urls' not in twitter_archive_clean.columns
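As an aside (a sketch of my own, not the cleaning method used above), the trailing t.co link could also be pulled out of the original text with a regular expression instead of splitting on spaces; the pattern assumes every rated tweet ends with one such link.
# hedged alternative sketch: extract the https://t.co/... link from the original text with a regex
alt_short_urls = twitter_archive.text.str.extract(r'(https://t\.co/\S+)', expand=False)
alt_short_urls.dropna().head()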
2. The four dog stage columns (doggo, floofer, pupper, puppo) in the twitter_archive dataset should be in one column dog_stage with a category datatype.

# select the dog stage columns from the dataset
all_dogs_type = ['doggo', 'floofer', 'pupper', 'puppo']
# replace the 'None' string with np.nan
twitter_archive_clean[all_dogs_type] = twitter_archive_clean[all_dogs_type].replace('None', np.nan)
# create the dog_stage column by joining the four columns; rows with more than one stage are comma-joined
twitter_archive_clean['dog_stage'] = twitter_archive_clean[all_dogs_type].\
apply(lambda x: ', '.join(x.dropna().astype(str)),axis =1)
# replace the empty string with nan and change datatype to category
twitter_archive_clean.dog_stage = twitter_archive_clean.dog_stage.replace('', np.nan).astype('category')
# drop the 4 columns
twitter_archive_clean = twitter_archive_clean.drop(columns = all_dogs_type, axis =1)
# check for the data columns and datatype
twitter_archive_clean.info()
# check for the values of the new column
twitter_archive_clean.dog_stage.value_counts()
#convert the datatype of rating_numerator to float by astype('float')
twitter_archive_clean.rating_numerator = twitter_archive_clean.rating_numerator.astype('float')
# check for the datatype
twitter_archive_clean.info()
3. rating_numerator & rating_denominator: several ratings were extracted incorrectly from the text and will be fixed by index.
4. rating_numerator & rating_denominator: the record at index [516] will be dropped by index.

# check whether the indices still exist
indices = [45,313,2335,1068,1165,1202,1662,695,763,1712,516,342]
for i in indices:
    if i in list(twitter_archive_clean.index):
        print('yes')
    else:
        print(f'No : {i} ')
# list of the indices of the errors after the check
indices = [45,2335,1068,1165,1202,1662,695,763,1712]
# set a list for the correct values relative to those indices
vals = [13.5,9,14,13,11,10,9.75,11.27,11.26]
# loop through the two lists and assign each index with the new correct value
for i, val in zip(indices, vals):
    twitter_archive_clean.loc[i, 'rating_numerator'] = val
    twitter_archive_clean.loc[i, 'rating_denominator'] = 10
# drop the index: 516
twitter_archive_clean.drop(index=516,inplace=True)
# test for value for one of the entries
assert twitter_archive_clean.loc[1712,'rating_numerator'] ==11.26
# test for dropping index=516
assert 516 not in list(twitter_archive_clean.index)
twitter_archive_clean.info()
# check for the rating_denominator values
twitter_archive_clean.rating_denominator.value_counts()
3. rating_numerator and rating_denominator in the twitter_archive dataset should form one column dog_rating normalized out of 10.

# divide rating_numerator by rating_denominator, multiply by 10, and create the dog_score column
twitter_archive_clean['dog_score'] = 10 * twitter_archive_clean.rating_numerator / twitter_archive_clean.rating_denominator
#drop the columns rating_numerator & rating_denominator column
twitter_archive_clean.drop(['rating_numerator','rating_denominator'],axis=1,inplace=True)
# check for values in the dog_score column
twitter_archive_clean.dog_score.value_counts()
# check for the twitter_archive_clean data
twitter_archive_clean.info()
# rename the id column in tweet_json to tweet_id
tweet_json_clean.columns = ['tweet_id', 'favorite_count', 'retweet_count']
# change the datatype to str(object) in all datasets
datasets = [twitter_archive_clean,image_pred_clean,tweet_json_clean]
for i in datasets:
i.tweet_id = i.tweet_id.astype('object')
# check for the datatypes for tweet_id in all datasets
for i in datasets:
assert i.tweet_id.dtypes == 'object'
4. image_pred dataset: condense the columns p1, p1_dog, p1_conf, p2, p2_dog, p2_conf, p3, p3_dog, p3_conf into two columns, dog_breed and confidence.
breed = []
confidence = []
# define a function that picks the first prediction that is a dog
def dog_breed_confidence(data):
    if data.p1_dog:
        breed.append(data.p1)
        confidence.append(data.p1_conf)
    elif data.p2_dog:
        breed.append(data.p2)
        confidence.append(data.p2_conf)
    elif data.p3_dog:
        breed.append(data.p3)
        confidence.append(data.p3_conf)
    else:
        breed.append('No breed')
        confidence.append(0)
# apply the function row wise
image_pred_clean.apply(dog_breed_confidence,axis =1)
# assign the new column names
image_pred_clean['dog_breed'] = breed
image_pred_clean['confidence'] = confidence
# drop the now-unneeded prediction columns
image_pred_clean.drop(columns = ['p1', 'p1_dog', 'p1_conf' , 'p2', 'p2_dog',
'p2_conf' , 'p3', 'p3_dog', 'p3_conf'],axis=1, inplace =True)
# replace the 'No breed' values with np.nan
image_pred_clean.replace('No breed',np.nan, inplace=True)
# replace underscores with spaces and title-case all breed values
image_pred_clean.dog_breed= image_pred_clean.dog_breed.str.replace('_',' ').str.title()
# check the top 5 rows in image_pred_clean
image_pred_clean.head()
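Side note: the row-wise apply above works, but the same p1 -> p2 -> p3 fall-through could be expressed with a vectorized np.select. This is a sketch of my own and assumes the original prediction columns are still available (here it uses the untouched image_pred copy).
# hedged alternative sketch: vectorized fall-through with np.select on the untouched image_pred
conditions = [image_pred.p1_dog, image_pred.p2_dog, image_pred.p3_dog]
breed_choices = [image_pred.p1, image_pred.p2, image_pred.p3]
conf_choices = [image_pred.p1_conf, image_pred.p2_conf, image_pred.p3_conf]
alt_breed = np.select(conditions, breed_choices, default='No breed')
alt_confidence = np.select(conditions, conf_choices, default=0)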
5. The tweet_json and image_pred datasets should be part of our main dataset twitter_archive.

- Merge twitter_archive_clean with tweet_json_clean on the tweet_id column (left join), keeping all of the original data.
- Merge twitter_archive_clean with image_pred_clean on the tweet_id column (inner join).

# use the merge function to merge twitter_archive_clean and tweet_json_clean on the tweet_id column (left join)
twitter_archive_clean = pd.merge(twitter_archive_clean, tweet_json_clean , how = 'left' , on = 'tweet_id')
# use the merge function to merge `twitter_archive_clean` and `image_pred_clean` on tweet_id column (inner join)
# and make master dataset
master_dataset = pd.merge(twitter_archive_clean, image_pred_clean , how = 'inner' , on = 'tweet_id')
# check new dataset after merge
master_dataset.info()
# check that all records have an image
master_dataset.jpg_url.isnull().sum()
# check for the unique values
master_dataset.source.unique()
# make a function fix_source which extracts the string between the tags
def fix_source(i):
    '''i is an html string from the source column in the twitter_archive_clean dataset'''
    # find the first closing bracket >
    x = i.find('>') + 1
    # find the first opening bracket < after the previous one
    y = i[x:].find('<')
    # extract the text in between
    return i[x:][:y]
# use apply function to fix the source column row wise
master_dataset.source= master_dataset.source.apply(lambda x: fix_source(x))
# check for the result values in the source column
master_dataset.source.value_counts()
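For comparison (my own sketch, not the method used above), the anchor-tag text could also be pulled out with a vectorized regular expression; the pattern assumes each source value is a single well-formed <a ...>label</a> string.
# hedged alternative sketch: extract the text between '>' and '<' with a regex
twitter_archive_clean.source.str.extract(r'>([^<]+)<', expand=False).value_counts()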
# change the datatype of timestamp column to datetime
master_dataset.timestamp = pd.to_datetime(master_dataset.timestamp)
# check for the datatype
master_dataset.timestamp.dtype
# extract the month name
master_dataset['month'] = master_dataset.timestamp.apply(lambda x: x.month_name())
# extract the day_name
master_dataset['day_name'] = master_dataset.timestamp.apply(lambda x: x.day_name())
# check the first rows of the timestamp, day_name and month columns
master_dataset.loc[:5,['timestamp','day_name','month']]
# check for the datatypes
master_dataset.info()
7. name column: rename it to dog_name, fix the mis-cased names (untitled_unlowers: 'BeBe', 'DonDon', 'CeCe', 'JD', 'DayZ'), and treat the lowercase non-names (lowers: 'such', 'a', 'quite', 'not', 'one', 'incredibly', 'mad', 'an', 'very', 'just', 'my', 'his', 'actually', 'getting', 'this', 'unacceptable', 'all', 'old', 'infuriating', 'the', 'by', 'officially', 'life', 'light', 'space') as missing values.
# rename the name column to dog_name
master_dataset.rename(columns={'name':'dog_name'},inplace=True)
# flag the lowercase non-names and set them to 'None' (replaced with np.nan below)
lower_mask = master_dataset.dog_name.str.islower()
master_dataset.loc[lower_mask, 'dog_name'] = 'None'
# make all values titled
master_dataset.dog_name = master_dataset.dog_name.apply(lambda x: x.title())
# replace 'None' with np.nan values
master_dataset.dog_name.replace('None', np.nan, inplace= True)
# check for all is titled
master_dataset.dog_name.str.istitle().value_counts()
# assert our work
assert master_dataset.dog_name.dropna().str.istitle().all()
assert not any(i in master_dataset.dog_name.unique() for i in lowers)
assert 'dog_name' in master_dataset.columns
# check for dog_name frequencies
master_dataset.dog_name.value_counts()
# check the data info
master_dataset.info()
8. We are interested in dogs. The text column reveals that some tweets are not about dogs at all: the phrase "only rate dogs" is used by the account admin to point out that a photo is not a dog.

# check the text column for "only rate dogs" together with a null value for dog_name
not_dogs = master_dataset.loc[master_dataset.dog_name.isnull()& master_dataset.text.str.match('.*only rate dogs')]
# check for number of records
len(not_dogs)
# explore data
not_dogs
# collect indices
#indices = master_dataset.loc[master_dataset.dog_name.isnull()& master_dataset.text.str.match('.*only rate dogs')].index.tolist()
# drop the non-dog rows by index
master_dataset.drop(not_dogs.index,axis= 0,inplace=True)
# check for only rate dogs if still exists
len(master_dataset.loc[master_dataset.dog_name.isnull()& master_dataset.text.str.match('.*only rate dogs')])
# check the new shape
master_dataset.shape
# check the remaining 'only rate dogs' matches
master_dataset.loc[master_dataset.text.str.match('.*only rate dogs'),['text','dog_name']]
# Success! As shown above, the remaining texts confirm these are real dogs
master_dataset[master_dataset.dog_score >14]
# drop outliers
outliers = master_dataset[master_dataset.dog_score >14].index.tolist()
master_dataset.drop(outliers,axis = 0, inplace=True)
# check for the master data shape
master_dataset.shape
master_dataset[master_dataset.dog_score>14]
#check for the final master
master_dataset.info()
# store the data after combining and cleaning
master_dataset.to_csv('twitter_archive_master.csv',encoding='utf-8',index=False)
1. What is the source most used by followers to share their dogs' photos?

plt.title("Distribution of Tweets' Source")
master_dataset.source.value_counts().sort_values().plot(kind ='barh')
plt.xlabel('Total Tweets')
plt.ylabel('Source');
# percentage of sources
master_dataset.source.value_counts() / master_dataset.source.value_counts().sum()
It is clear from the above that the Twitter for iPhone app has by far the largest share (about 98%), which may be explained by:
- the ease of taking a photo of a dog directly from the app
- the high resolution of phone cameras.
2. Which is the most popular day/month to post a dog photo?

master_dataset.day_name.value_counts().plot('bar')
plt.title("Distribution of Tweets over Days")
plt.xlabel('Week Days')
plt.ylabel('Frequency');
master_dataset.month.value_counts().plot('bar')
plt.title("Distribution of Tweets over Months")
plt.xlabel('Month')
plt.ylabel('Frequency');
- Interestingly, the most common day is Monday, which may indicate that many followers are not under work stress that day (perhaps they are not workers).
- The top month is December, which may be interpreted as the time of the Christmas and New Year vacations, when people tend to go out with their dogs and take photos.
- These interpretations need more data and investigation to be confirmed.
# select the month and day from timestamp e.g 01/07 will be 107
master_dataset.timestamp.apply(lambda x: x.day*100 + x.month ).value_counts().sort_values(ascending =False)[:15].plot('bar')
plt.title("Distribution of Tweets over Day/Month")
plt.xlabel('Day/Month "ddmm"')
plt.xticks(rotation = 90)
plt.ylabel('Frequency');
- Voila! It's quite clear now: the most common day/month values in our sample dataset show that most posts cluster around the end of November, which matches the Thanksgiving holiday (the fourth Thursday of November) followed by Black Friday, which is also a holiday.
3. Which is the most common dog name?

# rank the name frequencies in descending order
master_dataset.dog_name.value_counts().sort_values(ascending =False)[:10].plot('barh')
plt.title("Most Common Dogs' Names")
plt.xlabel('Frequency')
plt.ylabel("Dog's Name");
- It may be that people tend to use real (human) names for their dogs.
4. What is the most common dog stage?

# plot the dog_stage frequencies
master_dataset.dog_stage.value_counts().plot('bar')
plt.title("Distribution of Dog Stages")
plt.xlabel('Dog Stage')
plt.ylabel('Frequency');
Caveats: note that many dog stage values are missing, possibly because the stage was not known by the account admin and/or the dog's owner.
5. How does the @WeRateDogs account rate dogs?

# histogram of the dog scores
master_dataset.dog_score.hist(bins=15)
plt.title('Distribution of Dog scores')
plt.xlabel('Scores')
plt.ylabel('Count')
plt.axvline(x= master_dataset.dog_score.mean(),color='orange', linestyle='--',label ='Mean')
plt.xticks(np.arange(15))
plt.legend(loc=0);
# descriptive stats
master_dataset.dog_score.describe()
6. Which is the most common breed?

# frequency of dog breeds
master_dataset.dog_breed.value_counts()[:10].plot('bar')
plt.title('Distribution of Dog Breeds')
plt.xlabel('Breeds')
plt.ylabel('Count');
Caveats:
- The breed data contains a lot of null values.
- Also take into consideration that these breeds were predicted by a neural network model.
7. What is the account's performance over time?

# rolling average over a 30-tweet window for favorite_count
y1 = master_dataset.favorite_count.rolling(window = 30).mean()
# rolling average over a 30-tweet window for retweet_count
y2 = master_dataset.retweet_count.rolling(window = 30).mean()
x = master_dataset.timestamp
plt.plot(x,y1)
plt.plot(x,y2)
plt.xticks(rotation = 90)
plt.title('30 days Rolling Average Account Performance')
plt.xlabel('Dates')
plt.ylabel('30 days average')
plt.legend(loc=0);
It's quite clear here that the @WeRateDogs account has been getting more popular over time, as shown by the increasing number of likes (favorite_count).
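One caveat on the rolling averages above: they are computed in the row order of master_dataset, which runs from newest to oldest tweet. Here is a sketch of my own showing the same idea on a chronologically sorted copy (same columns, just sorted by timestamp first):
# hedged sketch: compute the rolling averages in chronological order
by_time = master_dataset.sort_values('timestamp')
plt.plot(by_time.timestamp, by_time.favorite_count.rolling(window=30).mean(), label='favorite_count')
plt.plot(by_time.timestamp, by_time.retweet_count.rolling(window=30).mean(), label='retweet_count')
plt.xticks(rotation=90)
plt.title('30-Tweet Rolling Average (chronological order)')
plt.xlabel('Date')
plt.ylabel('Rolling mean')
plt.legend(loc=0);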
8. Who is the top retweeted and/or favorited dog?

def get_photo(param):
    """
    Get the photo and numbers for the top entry of param after sorting in descending order.
    INPUT: param, one of our dataset columns
    ----------------------------------------------
    OUTPUT:
    image saved from the jpg_url link
    prints out the numbers for the top entry
    """
    winner = master_dataset.loc[master_dataset[param].sort_values(ascending=False).index[0]]
    r = requests.get(winner['jpg_url'])
    i = Image.open(BytesIO(r.content))
    i.save(f'./images/top_of_{param}.jpg')
    print(f'Top {param} is: {winner[param]}')
get_photo('favorite_count')
get_photo('retweet_count')
[Image] Top favorite_count winner with 132,810 likes
[Image] Top retweet_count winner with 79,515 retweets
# final winner
# get the winner that has both the largest retweet_count and the largest favorite_count
max_retweet = master_dataset.retweet_count.max()
max_favorite = master_dataset.favorite_count.max()
winner = master_dataset.query('favorite_count == @max_favorite & retweet_count == @max_retweet')
r = requests.get(winner['jpg_url'].item())
i = Image.open(BytesIO(r.content))
i.save('./images/winner.jpg')
# print the final result
print(f"No of retweets is : {winner['retweet_count'].item()}, \nNo of favorite_count is {winner['favorite_count'].item()}")
# winner dog-score
winner['dog_score']
This awesome dog proved his ability to swim in a pool, so he caught the attention of followers, both in retweets and in likes.
9. How does the @WeRateDogs account write their posts? (DogCloud)

text = master_dataset.text.to_string(index=False).replace('/', '').strip()
# define a function that generates the word cloud from the text
def wordzcloud(text):
    # choose the mask from a dog picture found online
    url = 'https://cdn.pixabay.com/photo/2013/11/28/11/32/dog-220324_960_720.jpg'
    r = requests.get(url)
    mask = np.array(Image.open(BytesIO(r.content)))
    # set stopwords (alternatively: set(STOPWORDS))
    stopwords = ('This', 'and', 'is', 'the', 'to')
    # set other parameters
    wc = WordCloud(background_color='white',
                   mask=mask,
                   stopwords=stopwords,
                   max_words=100,
                   contour_color='blue')
    # generate the word cloud
    wc.generate(text)
    # save and show the image
    wc.to_file('./images/dog_cloud.jpg')
    return wc.to_image()
# generate the word cloud from text
wordzcloud(text)
Interesting notes here:
- The admin uses friendly words such as Meet, Say Hello, Here, love, like...
- "He" appears more often than "She", which suggests that more of the featured dogs are male.
- The Pupper dog stage appears much more often than doggo, which matches my earlier analysis.