Feature Engineering for Machine Learning in Python

CHAPTER 1:
# Import the pandas library
import pandas as pd

# Load a CSV file using pandas' read_csv function
df = pd.read_csv(path_to_csv_file)

# Print the first 5 rows
print(df.head())

# Print the columns of the DataFrame
print(df.columns)

# Print the column datatypes of the DataFrame
print(df.dtypes)

# Select only the integer-typed columns
only_ints = df.select_dtypes(include=['int'])
print(only_ints.columns)

# One-hot encode the categorical column: one binary column per category
pd.get_dummies(df, columns=['Country'], prefix='C')

# Dummy encoding: drop the first category so it becomes the implicit baseline
pd.get_dummies(df, columns=['Country'], drop_first=True, prefix='C')
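The difference between the two encodings is easiest to see on a toy frame (illustrative data, not from the course):

toy = pd.DataFrame({'Country': ['USA', 'India', 'UK']})

# One-hot: one column per category -> C_India, C_UK, C_USA
print(pd.get_dummies(toy, columns=['Country'], prefix='C'))

# Dummy: the first category (C_India) is dropped -> C_UK, C_USA
print(pd.get_dummies(toy, columns=['Country'], drop_first=True, prefix='C'))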
# Count the occurrences of each label
counts = df['Country'].value_counts()
print(counts)

# Create a mask for labels that occur fewer than 5 times,
# then group them under 'Other'
mask = df['Country'].isin(counts[counts < 5].index)
df.loc[mask, 'Country'] = 'Other'
print(df['Country'].value_counts())



# Binarize: 0 for no violations, 1 for one or more
df['Binary_Violation'] = 0
df.loc[df['Number_of_Violations'] > 0, 'Binary_Violation'] = 1


# Bin the violation counts into three groups using pd.cut
import numpy as np
df['Binned_Group'] = pd.cut(
    df['Number_of_Violations'],
    bins=[-np.inf, 0, 2, np.inf],
    labels=[1, 2, 3]
)
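A quick sanity check (toy values, not the real data) of where the bin edges fall:

vals = pd.Series([0, 1, 2, 3, 10])
print(pd.cut(vals, bins=[-np.inf, 0, 2, np.inf], labels=[1, 2, 3]))
# 0 violations land in group 1, 1-2 in group 2, 3 or more in group 3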


CHAPTER 2:

# Print a concise summary of the DataFrame
df.info()

# Check for null values
print(df.isnull())

# Count the null values in a specific column
print(df['StackOverflowJobsRecommend'].isnull().sum())

# Check which values are not null
print(df.notnull())

# Drop all rows with at least one missing value
df.dropna(how='any')


# Drop rows with missing values in a specific column
df.dropna(subset=['VersionControl'])


# Replace missing values in a specific column
# with a given string
df['VersionControl'] = df['VersionControl'].fillna('None Given')


# Record where the values are not missing
df['SalaryGiven'] = df['ConvertedSalary'].notnull()
# Drop a specific column
df.drop(columns=['ConvertedSalary'])

# Check the mean and median of the column
print(df['ConvertedSalary'].mean())
print(df['ConvertedSalary'].median())







# Replace NaN values with the mean of the column
df['ConvertedSalary'] = df['ConvertedSalary'].fillna(df['ConvertedSalary'].mean())







# Convert the column's datatype to int64 using astype
df['ConvertedSalary'] = df['ConvertedSalary'].astype('int64')



# Replace NaN values with the rounded mean so the column can be cast to int64
df['ConvertedSalary'] = df['ConvertedSalary'].fillna(round(df['ConvertedSalary'].mean()))




# Check the datatype of the RawSalary column
print(df['RawSalary'].dtype)

# Print the first 5 rows of the RawSalary column
print(df['RawSalary'].head())


# Remove the comma separators using str.replace
df['RawSalary'] = df['RawSalary'].str.replace(',', '')
# Convert the column to float using astype
df['RawSalary'] = df['RawSalary'].astype('float')


# Coerce the column to numeric; values that cannot be parsed become NaN
coerced_vals = pd.to_numeric(df['RawSalary'], errors='coerce')
# Inspect the rows that failed to convert
print(df[coerced_vals.isna()].head())


# In Python, a sequence of method calls like this:
df['column_name'] = df['column_name'].method1()
df['column_name'] = df['column_name'].method2()
df['column_name'] = df['column_name'].method3()

# can be combined into a single expression with method chaining:
df['column_name'] = df['column_name'].method1().method2().method3()
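For example, the two RawSalary steps above can be written as one chain (a sketch, assuming the column needs only those two steps):

df['RawSalary'] = (
    df['RawSalary']
    .str.replace(',', '')
    .astype('float')
)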


CHAPTER 3:
# Import matplotlib's pyplot module for plotting
import matplotlib.pyplot as plt

# Draw histograms of the DataFrame's numeric columns
df.hist()

# Show the plot
plt.show()



# Plot a boxplot of the 'column_1' feature of the DataFrame
df[['column_1']].boxplot()
plt.show()



# Import seaborn for plotting
import seaborn as sns

# Plot a pairplot of the DataFrame
sns.pairplot(df)
plt.show()

# Summary statistics of the numeric columns
df.describe()



# Import MinMaxScaler from sklearn's preprocessing module
from sklearn.preprocessing import MinMaxScaler

# Initialize the scaler
scaler = MinMaxScaler()

# Fit the scaler on the Age column
scaler.fit(df[['Age']])

# Transform the data into a new column of the DataFrame
df['normalized_age'] = scaler.transform(df[['Age']])


# Standardization: rescale to zero mean and unit variance
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(df[['Age']])
df['standardized_col'] = scaler.transform(df[['Age']])


# Power transform (Yeo-Johnson by default) to reduce skew
from sklearn.preprocessing import PowerTransformer
log = PowerTransformer()
log.fit(df[['ConvertedSalary']])
df['log_ConvertedSalary'] = log.transform(df[['ConvertedSalary']])


# Trim values above the 95th percentile
q_cutoff = df['col_name'].quantile(0.95)
mask = df['col_name'] < q_cutoff
trimmed_df = df[mask]


# Remove values more than 3 standard deviations from the mean
mean = df['col_name'].mean()
std = df['col_name'].std()
cut_off = std * 3
lower, upper = mean - cut_off, mean + cut_off
new_df = df[(df['col_name'] < upper) & (df['col_name'] > lower)]


# Fit the scaler on the training data only, then reuse it on the test data
scaler = StandardScaler()
scaler.fit(train[['col']])
train['scaled_col'] = scaler.transform(train[['col']])
# FIT SOME MODEL
# ....
test = pd.read_csv('test.csv')
test['scaled_col'] = scaler.transform(test[['col']])


# Compute outlier cutoffs from the training data only
train_mean = train['col'].mean()
train_std = train['col'].std()
cut_off = train_std * 3
train_lower = train_mean - cut_off
train_upper = train_mean + cut_off
# Subset train data
train = train[(train['col'] < train_upper) & (train['col'] > train_lower)]
test = pd.read_csv('test.csv')
# Subset test data with the train-derived cutoffs
test = test[(test['col'] < train_upper) & (test['col'] > train_lower)]


CHAPTER 4:

print(speech_df.head())

# Keep only letters (regex=True is required in recent pandas), then lowercase
speech_df['text'] = speech_df['text'].str.replace('[^a-zA-Z]', ' ', regex=True)
speech_df['text'] = speech_df['text'].str.lower()
print(speech_df['text'][0])


# Character count of each speech
speech_df['char_cnt'] = speech_df['text'].str.len()
print(speech_df['char_cnt'].head())


# Split each speech into a list of words
speech_df['word_splits'] = speech_df['text'].str.split()
print(speech_df['word_splits'].head(1))


# Word count of each speech
speech_df['word_cnt'] = speech_df['text'].str.split().str.len()
print(speech_df['word_cnt'].head())


# Average word length
speech_df['avg_word_len'] = speech_df['char_cnt'] / speech_df['word_cnt']

from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer()
print(cv)


from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(min_df=0.1, max_df=0.9)
# min_df: minimum fraction of documents a word must occur in
# max_df: maximum fraction of documents a word can occur in
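A toy illustration (made-up documents, not the speech data) of how these thresholds prune the vocabulary:

docs = ['the cat sat', 'the dog sat', 'the bird flew', 'the fish swam']
cv_small = CountVectorizer(min_df=0.5, max_df=0.9)
cv_small.fit(docs)
print(cv_small.get_feature_names_out())
# 'the' occurs in all 4 documents (above max_df) and each animal word in
# only 1 (below min_df), so only 'sat' (2 of 4 documents) survives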


cv.fit(speech_df['text_clean'])

cv_transformed = cv.transform(speech_df['text_clean'])
print(cv_transformed)


cv_transformed.toarray()

feature_names = cv.get_feature_names_out()
print(feature_names)


cv_transformed = cv.fit_transform(speech_df['text_clean'])
print(cv_transformed)


cv_df = pd.DataFrame(cv_transformed.toarray(),
                     columns=cv.get_feature_names_out()).add_prefix('Counts_')
print(cv_df.head())


speech_df = pd.concat([speech_df, cv_df],axis=1, sort=False)
print(speech_df.shape)


print(speech_df['Counts_the'].head())

from sklearn.feature_extraction.text import TfidfVectorizer
tv = TfidfVectorizer()
print(tv)


tv = TfidfVectorizer(max_features=100,
                     stop_words='english')
# max_features: maximum number of columns created from TF-IDF
# stop_words: list of common words to omit, e.g. "and", "the"
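A small sketch (toy corpus, not the speech data) of what the two parameters do:

docs = ['the president spoke to the nation',
        'the nation listened to the speech']
tv_small = TfidfVectorizer(max_features=3, stop_words='english')
tv_small.fit(docs)
print(tv_small.get_feature_names_out())
# stop words like 'the' and 'to' are dropped first; of the remaining terms,
# only the 3 with the highest corpus frequency become columns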


tv.fit(train_speech_df['text'])
train_tv_transformed = tv.transform(train_speech_df['text'])


train_tv_df = pd.DataFrame(train_tv_transformed.toarray(),
                           columns=tv.get_feature_names_out())\
    .add_prefix('TFIDF_')
train_speech_df = pd.concat([train_speech_df, train_tv_df],
                            axis=1, sort=False)


# Inspect the highest TF-IDF terms in the first document
examine_row = train_tv_df.iloc[0]
print(examine_row.sort_values(ascending=False))


# Reuse the vectorizer fitted on the training data
test_tv_transformed = tv.transform(test_speech_df['text_clean'])
test_tv_df = pd.DataFrame(test_tv_transformed.toarray(),
                          columns=tv.get_feature_names_out())\
    .add_prefix('TFIDF_')
test_speech_df = pd.concat([test_speech_df, test_tv_df],
                           axis=1, sort=False)


tv_bi_gram_vec = TfidfVectorizer(ngram_range=(2, 2))
# Fit and apply the bigram vectorizer
tv_bi_gram = tv_bi_gram_vec.fit_transform(speech_df['text'])
# Print the bigram features
print(tv_bi_gram_vec.get_feature_names_out())


# Create a DataFrame with the bigram features
tv_df = pd.DataFrame(tv_bi_gram.toarray(),
                     columns=tv_bi_gram_vec.get_feature_names_out())\
    .add_prefix('Counts_')
tv_sums = tv_df.sum()
print(tv_sums.head())


print(tv_sums.sort_values(ascending=False).head())

