import numpy as np
import pandas as pd
from keras.utils import np_utils
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score
from scipy.stats import spearmanr, pearsonr
from sklearn.linear_model import LinearRegression
# Load the Facebook post metrics. The file is semicolon-delimited; the
# separator is passed by keyword because recent pandas releases accept only
# the path as a positional argument of read_csv.
df = pd.read_csv("dataset_Facebook.csv", sep=";")
df.head()
Page total likes | Type | Category | Post Month | Post Weekday | Post Hour | Paid | Lifetime Post Total Reach | Lifetime Post Total Impressions | Lifetime Engaged Users | Lifetime Post Consumers | Lifetime Post Consumptions | Lifetime Post Impressions by people who have liked your Page | Lifetime Post reach by people who like your Page | Lifetime People who have liked your Page and engaged with your post | comment | like | share | Total Interactions | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 139441 | Photo | 2 | 12 | 4 | 3 | 0.0 | 2752 | 5091 | 178 | 109 | 159 | 3078 | 1640 | 119 | 4 | 79.0 | 17.0 | 100 |
1 | 139441 | Status | 2 | 12 | 3 | 10 | 0.0 | 10460 | 19057 | 1457 | 1361 | 1674 | 11710 | 6112 | 1108 | 5 | 130.0 | 29.0 | 164 |
2 | 139441 | Photo | 3 | 12 | 3 | 3 | 0.0 | 2413 | 4373 | 177 | 113 | 154 | 2812 | 1503 | 132 | 0 | 66.0 | 14.0 | 80 |
3 | 139441 | Photo | 2 | 12 | 2 | 10 | 1.0 | 50128 | 87991 | 2211 | 790 | 1119 | 61027 | 32048 | 1386 | 58 | 1572.0 | 147.0 | 1777 |
4 | 139441 | Photo | 2 | 12 | 2 | 3 | 0.0 | 7244 | 13594 | 671 | 410 | 580 | 6228 | 3200 | 396 | 19 | 325.0 | 49.0 | 393 |
# Annotated correlation heatmap of the numeric columns.
# "Type" still holds strings at this point, and DataFrame.corr() no longer
# drops non-numeric columns silently in recent pandas, so select the numeric
# columns explicitly before correlating.
plt.figure(figsize=(7, 7))
numeric_df = df.select_dtypes(include=np.number)
sns.heatmap(numeric_df.corr(), annot=True, cbar=False)
<matplotlib.axes._subplots.AxesSubplot at 0x2dc88131d30>
# Histogram of the first column of the frame ("Page total likes" in the
# loaded table).
page_likes = df.iloc[:, 0]
plt.hist(page_likes)
plt.title("Total pages likes")
Text(0.5, 1.0, 'Total pages likes')
# Bar chart of the number of posts per post type.
# value_counts() orders categories by frequency, so the tick labels are taken
# from the same Series as the bar heights instead of a hard-coded tuple that
# silently assumes one particular frequency order.
type_counts = df['Type'].value_counts()
objects = tuple(type_counts.index)
y_pos = np.arange(len(objects))
plt.bar(y_pos, type_counts.values)
plt.xticks(y_pos, objects)
plt.ylabel('Number of posts')
plt.title('Different types of post compared')
Text(0.5, 1.0, 'Different types of post compared')
# Grouped count of posts per "Type", split by the "Paid" flag.
fig_size = (7, 7)
plt.figure(figsize=fig_size)
sns.countplot(data=df, hue='Paid', x='Type')
plt.title("Number of posts: Paid vs Not Paid")
plt.ylabel("Number of posts")
Text(0, 0.5, 'Number of posts')
# Bar chart of the number of posts per calendar month.
plt.figure(figsize=(7, 7))
objects = ('January', 'February', 'March', 'April', 'May', 'June', 'July',
           'August', 'September', 'October', 'November', 'December')
y_pos = np.arange(len(objects))
# value_counts() returns counts ordered by frequency, not by month, so the
# bars would not line up with the calendar-ordered labels. Re-key the counts
# by month number; months with no posts get an explicit 0 so there are always
# exactly twelve bars.
# NOTE(review): assumes "Post Month" is encoded 1-12 -- confirm against the data.
month_counts = df['Post Month'].value_counts().reindex(range(1, 13), fill_value=0)
plt.bar(y_pos, month_counts.values)
plt.xticks(y_pos, objects, rotation=70)
plt.ylabel('Number of posts')
plt.title('Number of posts for each month')
Text(0.5, 1.0, 'Number of posts for each month')
# Bar chart of the number of posts per day of the week.
plt.figure(figsize=(7, 7))
objects = ('Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday')
y_pos = np.arange(len(objects))
# value_counts() orders by frequency; re-key by weekday code so that bar i
# really belongs to label i, and fill absent days with 0 to keep seven bars.
# NOTE(review): assumes "Post Weekday" is encoded 1-7 and that code 1 is
# Monday (the labelling used originally) -- confirm against the dataset docs.
weekday_counts = df['Post Weekday'].value_counts().reindex(range(1, 8), fill_value=0)
plt.bar(y_pos, weekday_counts.values)
plt.xticks(y_pos, objects, rotation=70)
plt.ylabel('Number of posts')
plt.title('Number of posts for each week-day')
Text(0.5, 1.0, 'Number of posts for each week-day')
# Bar chart of the number of posts per hour of the day.
plt.figure(figsize=(7, 7))
objects = np.arange(24)
y_pos = np.arange(len(objects))
# The bar heights `a` were never defined in the visible code, so compute them
# here: posts per hour, keyed by hour so the bars match the tick labels, with
# 0 for hours in which nothing was posted.
# NOTE(review): assumes "Post Hour" is encoded 0-23 -- confirm against the data.
a = df['Post Hour'].value_counts().reindex(objects, fill_value=0).values
plt.bar(y_pos, a)
plt.xticks(y_pos, objects, rotation=70)
plt.ylabel('Number of posts')
plt.title('Number of posts for each hour of the day')
Text(0.5, 1.0, 'Number of posts for each hour of the day')
# Bar chart comparing the number of free and paid posts.
plt.figure(figsize=(7, 7))
objects = ('Free', 'Paid')
y_pos = np.arange(len(objects))
# Key the counts by flag value rather than relying on value_counts()'
# frequency ordering, which only matches ('Free', 'Paid') while unpaid posts
# happen to be the majority.
# NOTE(review): assumes "Paid" is 0.0 for free and 1.0 for paid posts -- confirm.
paid_counts = df['Paid'].value_counts().reindex([0.0, 1.0], fill_value=0)
plt.bar(y_pos, paid_counts.values)
plt.xticks(y_pos, objects, rotation=70)
plt.ylabel('Number of posts')
plt.title('Number of posts: Free vs Paid')
Text(0.5, 1.0, 'Number of posts: Free vs Paid')
# Grouped count of posts per "Post Weekday", split by the "Paid" flag.
fig_size = (7, 7)
plt.figure(figsize=fig_size)
sns.countplot(data=df, hue='Paid', x='Post Weekday')
plt.title("Number of posts: Free vs Paid")
Text(0.5, 1.0, 'Number of posts: Free vs Paid')
# Histogram of the fourth column from the end of the frame ("comment" in the
# loaded table).
comments_per_post = df.iloc[:, -4]
plt.hist(comments_per_post)
plt.title("Comments per post distribution")
Text(0.5, 1.0, 'Comments per post distribution')
# Histogram of the third column from the end of the frame ("like" in the
# loaded table).
# Missing values are dropped first: NaN entries otherwise reach numpy's
# histogram code and trigger "invalid value encountered" RuntimeWarnings.
likes_per_post = df.iloc[:, -3].dropna()
plt.hist(likes_per_post)
plt.title("Likes per post distribution")
C:\Users\hp\Anaconda3\lib\site-packages\numpy\lib\histograms.py:824: RuntimeWarning: invalid value encountered in greater_equal keep = (tmp_a >= first_edge) C:\Users\hp\Anaconda3\lib\site-packages\numpy\lib\histograms.py:825: RuntimeWarning: invalid value encountered in less_equal keep &= (tmp_a <= last_edge)
Text(0.5, 1.0, 'Likes per post distribution')
# Histogram of the second column from the end of the frame ("share" in the
# loaded table).
# NOTE(review): the column is float-typed, which suggests it contains missing
# values; they are dropped so NaN never reaches the histogram computation.
shares_per_post = df.iloc[:, -2].dropna()
plt.hist(shares_per_post)
plt.title("Share per post distribution")
Text(0.5, 1.0, 'Share per post distribution')
# Pairwise scatter-plot matrix of the post attributes and interaction counts.
# Columns 7-14 (the "Lifetime ..." metrics in the loaded table) are left out
# to keep the grid readable.
dfplot = df.drop(df.columns[7:15], axis=1)
# pairplot builds its own figure, so no plt.figure() call is made here: one
# issued after the plot only produces an extra, empty figure.
sns.pairplot(data=dfplot)
<Figure size 504x504 with 0 Axes>
<Figure size 504x504 with 0 Axes>
# Replace the string labels in "Type" with integer codes; the fitted encoder
# is kept in `lb_make`.
lb_make = LabelEncoder().fit(df["Type"])
df["Type"] = lb_make.transform(df["Type"])
# Every remaining missing value in the frame becomes 0.
df = df.fillna(value=0)
df.head()
Page total likes | Type | Category | Post Month | Post Weekday | Post Hour | Paid | Lifetime Post Total Reach | Lifetime Post Total Impressions | Lifetime Engaged Users | Lifetime Post Consumers | Lifetime Post Consumptions | Lifetime Post Impressions by people who have liked your Page | Lifetime Post reach by people who like your Page | Lifetime People who have liked your Page and engaged with your post | comment | like | share | Total Interactions | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 139441 | 1 | 2 | 12 | 4 | 3 | 0.0 | 2752 | 5091 | 178 | 109 | 159 | 3078 | 1640 | 119 | 4 | 79.0 | 17.0 | 100 |
1 | 139441 | 2 | 2 | 12 | 3 | 10 | 0.0 | 10460 | 19057 | 1457 | 1361 | 1674 | 11710 | 6112 | 1108 | 5 | 130.0 | 29.0 | 164 |
2 | 139441 | 1 | 3 | 12 | 3 | 3 | 0.0 | 2413 | 4373 | 177 | 113 | 154 | 2812 | 1503 | 132 | 0 | 66.0 | 14.0 | 80 |
3 | 139441 | 1 | 2 | 12 | 2 | 10 | 1.0 | 50128 | 87991 | 2211 | 790 | 1119 | 61027 | 32048 | 1386 | 58 | 1572.0 | 147.0 | 1777 |
4 | 139441 | 1 | 2 | 12 | 2 | 3 | 0.0 | 7244 | 13594 | 671 | 410 | 580 | 6228 | 3200 | 396 | 19 | 325.0 | 49.0 | 393 |
# Features are every column except the target; the target is the "like" count.
X = df.drop(['like'], axis=1).values
Y = df['like'].values
# Split BEFORE scaling: fitting the scaler on the full data set would let
# test-set means/variances leak into the training features. The split itself
# is unaffected by this reordering because it depends only on the row count
# and random_state.
X_Train, X_Test, Y_Train, Y_Test = train_test_split(X, Y, test_size=0.30, random_state=101)
scaler = StandardScaler().fit(X_Train)
X_Train = scaler.transform(X_Train)
X_Test = scaler.transform(X_Test)
# Keep `X` as the standardized full feature matrix for code that fits on all
# rows, using the same training-set statistics.
X = scaler.transform(X)
# NOTE(review): in the previewed rows 'Total Interactions' equals
# comment + like + share, so keeping it (together with 'comment' and 'share')
# in X lets a linear model reconstruct 'like' exactly. Consider dropping it
# from the features before drawing conclusions from the scores.
from sklearn import svm
import matplotlib.pyplot as plt
def feature_plot(classifier, feature_names, top_features=4):
    """Plot the most negative and most positive coefficients of a linear model.

    Args:
        classifier: Fitted estimator exposing a ``coef_`` array.
        feature_names: Sequence of feature names used for the tick labels.
        top_features: Number of coefficients to show from each end of the
            sorted coefficient vector.

    The flattened ``coef_`` is sorted; the ``top_features`` smallest and the
    ``top_features`` largest values are drawn as bars (green when negative,
    blue otherwise) and the figure is shown.
    """
    coef = classifier.coef_.ravel()
    order = np.argsort(coef)
    # Slice from an explicit start index: ``order[-0:]`` would select the
    # whole array instead of nothing when top_features is 0.
    top_positive_coefficients = order[max(order.size - top_features, 0):]
    top_negative_coefficients = order[:top_features]
    top_coefficients = np.hstack([top_negative_coefficients, top_positive_coefficients])
    plt.figure(figsize=(18, 7))
    colors = ['green' if c < 0 else 'blue' for c in coef[top_coefficients]]
    # One bar per selected coefficient; this stays consistent even when the
    # model has fewer than ``top_features`` coefficients.
    positions = np.arange(top_coefficients.size)
    plt.bar(positions, coef[top_coefficients], color=colors)
    feature_names = np.array(feature_names)
    n_names = feature_names.size
    if n_names and coef.size % n_names == 0:
        # Assumes coef_ is laid out as (n_outputs, n_features), as
        # scikit-learn linear models do, so a flat index maps back to its
        # feature via modulo -- TODO confirm for other estimator types.
        labels = feature_names[top_coefficients % n_names]
    else:
        # The names do not line up with the coefficient vector; fall back to
        # the raw flat indices rather than showing misleading names.
        labels = top_coefficients.astype(str)
    plt.xticks(positions, labels, rotation=45, ha='right')
    plt.show()
# Names of the feature columns (everything except the target), printed for
# reference and passed to the coefficient plot.
feature_names = df.drop(['like'], axis=1).columns.values
print(feature_names)
# "like" is a count being predicted as a number, so fit a linear
# support-vector *regressor*. LinearSVC is a classifier: it would treat every
# distinct like count as its own class and yield one coefficient row per
# class. max_iter is raised above the default to give liblinear room to
# converge; random_state makes the fit repeatable.
trainedsvm = svm.LinearSVR(max_iter=10000, random_state=101).fit(X, Y)
feature_plot(trainedsvm, feature_names)
['Page total likes' 'Type' 'Category' 'Post Month' 'Post Weekday' 'Post Hour' 'Paid' 'Lifetime Post Total Reach' 'Lifetime Post Total Impressions' 'Lifetime Engaged Users' 'Lifetime Post Consumers' 'Lifetime Post Consumptions' 'Lifetime Post Impressions by people who have liked your Page' 'Lifetime Post reach by people who like your Page' 'Lifetime People who have liked your Page and engaged with your post' 'comment' 'share' 'Total Interactions']
C:\Users\hp\Anaconda3\lib\site-packages\sklearn\svm\base.py:931: ConvergenceWarning: Liblinear failed to converge, increase the number of iterations. "the number of iterations.", ConvergenceWarning)
# Ordinary least-squares baseline.
reg = LinearRegression().fit(X_Train, Y_Train)
# Report R^2 on both splits: a training-set score alone says nothing about
# generalization, and a bare expression is discarded when this runs outside a
# notebook cell.
print("Linear regression R^2 (train): ", reg.score(X_Train, Y_Train))
print("Linear regression R^2 (test): ", reg.score(X_Test, Y_Test))
1.0
# Random-forest regressor: 500 trees, at least 10 samples required to split a
# node. random_state is fixed (same seed as the train/test split) so the
# reported scores are repeatable between runs.
randomforest = RandomForestRegressor(n_estimators=500, min_samples_split=10, random_state=101)
randomforest.fit(X_Train, Y_Train)
# Predict with the fitted estimator; the original referenced an undefined
# name `rf` here.
p_train = randomforest.predict(X_Train)
p_test = randomforest.predict(X_Test)
# R^2 on the training and held-out splits.
train_acc = r2_score(Y_Train, p_train)
test_acc = r2_score(Y_Test, p_test)
print("Training Score: ", train_acc)
# Print the variable computed above; `test_score` was never defined.
print("Test Score: ", test_acc)
Training Score: 0.9184766137141158 Test Score: 0.9863946408914626