Facebook performance metrics

Data Preprocessing

In [48]:
import numpy as np
import pandas as pd
from keras.utils import np_utils
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score
from scipy.stats import spearmanr, pearsonr
from sklearn.linear_model import LinearRegression
In [2]:
df = pd.read_csv("dataset_Facebook.csv",";")
df.head()
Out[2]:
Page total likes Type Category Post Month Post Weekday Post Hour Paid Lifetime Post Total Reach Lifetime Post Total Impressions Lifetime Engaged Users Lifetime Post Consumers Lifetime Post Consumptions Lifetime Post Impressions by people who have liked your Page Lifetime Post reach by people who like your Page Lifetime People who have liked your Page and engaged with your post comment like share Total Interactions
0 139441 Photo 2 12 4 3 0.0 2752 5091 178 109 159 3078 1640 119 4 79.0 17.0 100
1 139441 Status 2 12 3 10 0.0 10460 19057 1457 1361 1674 11710 6112 1108 5 130.0 29.0 164
2 139441 Photo 3 12 3 3 0.0 2413 4373 177 113 154 2812 1503 132 0 66.0 14.0 80
3 139441 Photo 2 12 2 10 1.0 50128 87991 2211 790 1119 61027 32048 1386 58 1572.0 147.0 1777
4 139441 Photo 2 12 2 3 0.0 7244 13594 671 410 580 6228 3200 396 19 325.0 49.0 393

Exploratory Data Analysis (EDA)

Correlation Matrix

In [5]:
plt.figure(figsize=(7,7))
sns.heatmap(df.corr(),annot=True,cbar=False)
Out[5]:
<matplotlib.axes._subplots.AxesSubplot at 0x2dc88131d30>

Total Page Likes

In [8]:
plt.hist(df.iloc[:,0])
plt.title("Total pages likes")
Out[8]:
Text(0.5, 1.0, 'Total pages likes')

Category

In [10]:
objects = ('Photos', 'Status', 'Link', 'Video')
y_pos = np.arange(len(objects))

plt.bar(y_pos, df.iloc[:,1].value_counts().values)
plt.xticks(y_pos, objects)
plt.ylabel('Number of posts')
plt.title('Different types of post compared')
Out[10]:
Text(0.5, 1.0, 'Different types of post compared')
In [11]:
plt.figure(figsize=(7,7))

sns.countplot(x='Type',hue='Paid',data=df)
plt.title("Number of posts: Paid vs Not Paid")
plt.ylabel("Number of posts")
Out[11]:
Text(0, 0.5, 'Number of posts')

Post Month

In [12]:
plt.figure(figsize=(7,7))

objects = ('Jenuary', 'Febraury', 'March', 'April','May','June','July','August','September','October','November','December')
y_pos = np.arange(len(objects))

plt.bar(y_pos, df.iloc[:,3].value_counts().values)
plt.xticks(y_pos, objects, rotation=70)
plt.ylabel('Number of posts')
plt.title('Number of posts for each month')
Out[12]:
Text(0.5, 1.0, 'Number of posts for each month')

Post Weekday

In [13]:
plt.figure(figsize=(7,7))

objects = ('Monday', 'Tuesday', 'Wednsday', 'Thoursday','Friday','Saturday','Sunday')
y_pos = np.arange(len(objects))

plt.bar(y_pos, df.iloc[:,4].value_counts().values)
plt.xticks(y_pos, objects, rotation=70)
plt.ylabel('Number of posts')
plt.title('Number of posts for each week-day')
Out[13]:
Text(0.5, 1.0, 'Number of posts for each week-day')

Post Hour

In [15]:
plt.figure(figsize=(7,7))

objects = (np.arange(24))
y_pos = np.arange(len(objects))
# print(len(y_pos))
# print(len(a))
plt.bar(y_pos, a)
plt.xticks(y_pos, objects, rotation=70)
plt.ylabel('Number of posts')
plt.title('Number of posts for each hour of the day')
Out[15]:
Text(0.5, 1.0, 'Number of posts for each hour of the day')
In [16]:
plt.figure(figsize=(7,7))

objects = ('Free', 'Paid')
y_pos = np.arange(len(objects))

plt.bar(y_pos, df.iloc[:,6].value_counts().values)
plt.xticks(y_pos, objects, rotation=70)
plt.ylabel('Number of posts')
plt.title('Number of posts: Free vs Paid')
Out[16]:
Text(0.5, 1.0, 'Number of posts: Free vs Paid')
In [17]:
plt.figure(figsize=(7,7))
sns.countplot(x='Post Weekday',hue='Paid',data=df)
plt.title("Number of posts: Free vs Paid")
Out[17]:
Text(0.5, 1.0, 'Number of posts: Free vs Paid')

Comments

In [18]:
plt.hist(df.iloc[:,len(df.columns)-4])
plt.title("Comments per post distribution")
Out[18]:
Text(0.5, 1.0, 'Comments per post distribution')

Likes

In [19]:
plt.hist(df.iloc[:,len(df.columns)-3])
plt.title("Likes per post distribution")
C:\Users\hp\Anaconda3\lib\site-packages\numpy\lib\histograms.py:824: RuntimeWarning: invalid value encountered in greater_equal
  keep = (tmp_a >= first_edge)
C:\Users\hp\Anaconda3\lib\site-packages\numpy\lib\histograms.py:825: RuntimeWarning: invalid value encountered in less_equal
  keep &= (tmp_a <= last_edge)
Out[19]:
Text(0.5, 1.0, 'Likes per post distribution')

Shares

In [20]:
plt.hist(df.iloc[:,len(df.columns)-2])
plt.title("Share per post distribution")
Out[20]:
Text(0.5, 1.0, 'Share per post distribution')

Lifetime Data Analysis

In [21]:
dfplot = df.drop(df.columns[7:15],axis =1)
sns.pairplot(data=dfplot)
plt.figure(figsize=(7,7))
Out[21]:
<Figure size 504x504 with 0 Axes>
<Figure size 504x504 with 0 Axes>

Machine Learning

In [22]:
lb_make = LabelEncoder()
df["Type"] = lb_make.fit_transform(df["Type"])
df = df.fillna(0)
df.head(5)
Out[22]:
Page total likes Type Category Post Month Post Weekday Post Hour Paid Lifetime Post Total Reach Lifetime Post Total Impressions Lifetime Engaged Users Lifetime Post Consumers Lifetime Post Consumptions Lifetime Post Impressions by people who have liked your Page Lifetime Post reach by people who like your Page Lifetime People who have liked your Page and engaged with your post comment like share Total Interactions
0 139441 1 2 12 4 3 0.0 2752 5091 178 109 159 3078 1640 119 4 79.0 17.0 100
1 139441 2 2 12 3 10 0.0 10460 19057 1457 1361 1674 11710 6112 1108 5 130.0 29.0 164
2 139441 1 3 12 3 3 0.0 2413 4373 177 113 154 2812 1503 132 0 66.0 14.0 80
3 139441 1 2 12 2 10 1.0 50128 87991 2211 790 1119 61027 32048 1386 58 1572.0 147.0 1777
4 139441 1 2 12 2 3 0.0 7244 13594 671 410 580 6228 3200 396 19 325.0 49.0 393
In [36]:
X = df.drop(['like'], axis = 1).values
Y = df['like'].values

X = StandardScaler().fit_transform(X)

X_Train, X_Test, Y_Train, Y_Test = train_test_split(X, Y, test_size = 0.30, random_state = 101)

SVM Feature Importance

In [37]:
from sklearn import svm
import matplotlib.pyplot as plt
def feature_plot(classifier, feature_names, top_features=4):
    coef = classifier.coef_.ravel()
    top_positive_coefficients = np.argsort(coef)[-top_features:]
    top_negative_coefficients = np.argsort(coef)[:top_features]
    top_coefficients = np.hstack([top_negative_coefficients, top_positive_coefficients])
    plt.figure(figsize=(18, 7))
    colors = ['green' if c < 0 else 'blue' for c in coef[top_coefficients]]
    plt.bar(np.arange(2 * top_features), coef[top_coefficients], color=colors)
    feature_names = np.array(feature_names)
    #plt.xticks(np.arange(1 + 2 * top_features), feature_names[top_coefficients], rotation=45, ha='right')
    plt.show()

print(df.drop(['like'], axis = 1).columns.values)

trainedsvm = svm.LinearSVC().fit(X, Y)
feature_plot(trainedsvm, df.drop(['like'], axis = 1).columns.values)
['Page total likes' 'Type' 'Category' 'Post Month' 'Post Weekday'
 'Post Hour' 'Paid' 'Lifetime Post Total Reach'
 'Lifetime Post Total Impressions' 'Lifetime Engaged Users'
 'Lifetime Post Consumers' 'Lifetime Post Consumptions'
 'Lifetime Post Impressions by people who have liked your Page'
 'Lifetime Post reach by people who like your Page'
 'Lifetime People who have liked your Page and engaged with your post'
 'comment' 'share' 'Total Interactions']
C:\Users\hp\Anaconda3\lib\site-packages\sklearn\svm\base.py:931: ConvergenceWarning: Liblinear failed to converge, increase the number of iterations.
  "the number of iterations.", ConvergenceWarning)

Linear Regression

In [38]:
reg = LinearRegression().fit(X_Train,Y_Train)
reg.score(X_Train,Y_Train)
Out[38]:
1.0

Random Forest

In [50]:
randomforest = RandomForestRegressor(n_estimators=500,min_samples_split=10)
randomforest.fit(X_Train,Y_Train)

p_train = rf.predict(X_Train)
p_test = rf.predict(X_Test)

train_acc = r2_score(Y_Train, p_train)
test_acc = r2_score(Y_Test, p_test)

print("Training Score: ", train_acc)
print("Test Score: ", test_score)
Training Score:  0.9184766137141158
Test Score:  0.9863946408914626