Feature Engineering for Machine Learning in Python
CHAPTER 1:
# Import the pandas library
import pandas as pd

# Load a CSV file using pandas' read_csv function
df = pd.read_csv(path_to_csv_file)

# Print the first 5 rows
print(df.head())

# Print the columns of the DataFrame
print(df.columns)

# Print the column data types of the DataFrame
print(df.dtypes)

# Select only the integer columns
only_ints = df.select_dtypes(include=['int'])
print(only_ints.columns)
# One-hot encode the categorical Country column
pd.get_dummies(df, columns=['Country'], prefix='C')

# Dummy encoding: drop_first drops one redundant column
pd.get_dummies(df, columns=['Country'], drop_first=True, prefix='C')
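A minimal sketch of the difference, using a small hypothetical DataFrame:

import pandas as pd

# Hypothetical toy data to illustrate the two encodings
toy = pd.DataFrame({'Country': ['USA', 'UK', 'USA', 'India']})

# One-hot encoding: one column per category (C_India, C_UK, C_USA)
print(pd.get_dummies(toy, columns=['Country'], prefix='C'))

# Dummy encoding: C_India is dropped; a row of all zeros now
# implies India, so no information is lost
print(pd.get_dummies(toy, columns=['Country'], drop_first=True, prefix='C'))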
# Count the occurrences of each label
counts = df['Country'].value_counts()
print(counts)

# Create a mask for labels that occur fewer than 5 times
mask = df['Country'].isin(counts[counts < 5].index)

# Relabel the rare categories as 'Other' (.loc avoids chained assignment)
df.loc[mask, 'Country'] = 'Other'
print(df['Country'].value_counts())
# Binarize: flag any row with at least one violation
df['Binary_Violation'] = 0
df.loc[df['Number_of_Violations'] > 0, 'Binary_Violation'] = 1

# Bin the violation counts into three groups
import numpy as np
df['Binned_Group'] = pd.cut(
    df['Number_of_Violations'], bins=[-np.inf, 0, 2, np.inf],
    labels=[1, 2, 3])
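A quick sketch of how these bins behave, on hypothetical violation counts:

import numpy as np
import pandas as pd

# 0 falls in (-inf, 0] -> group 1; 1-2 in (0, 2] -> group 2; 3+ -> group 3
violations = pd.Series([0, 1, 2, 3, 5])
print(pd.cut(violations, bins=[-np.inf, 0, 2, np.inf], labels=[1, 2, 3]))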
CHAPTER 2:
print(df.info())
# Check for null values
print(df.isnull())

# Count the null values in a specific column
print(df['StackOverflowJobsRecommend'].isnull().sum())

# Check where values are not null
print(df.notnull())
# Drop all rows with at least one missing value
df.dropna(how='any')
# Drop rows with missing values in a specific column
df.dropna(subset=['VersionControl'])
# Replace missing values in a specific column
# with a given string
df['VersionControl'] = df['VersionControl'].fillna('None Given')
# Record where the values are not missing
df['SalaryGiven'] = df['ConvertedSalary'].notnull()
# Drop a specific column
df.drop(columns=['ConvertedSalary'])
# Check the mean and median of a specific column
print(df['ConvertedSalary'].mean())
print(df['ConvertedSalary'].median())

# Replace NaN values with the column mean
df['ConvertedSalary'] = df['ConvertedSalary'].fillna(df['ConvertedSalary'].mean())
# Convert the column to int64 using astype
df['ConvertedSalary'] = df['ConvertedSalary'].astype('int64')

# Or replace NaN values with the rounded mean first
df['ConvertedSalary'] = df['ConvertedSalary'].fillna(round(df['ConvertedSalary'].mean()))

# Check the dtype of the RawSalary column
print(df['RawSalary'].dtype)

# Print the first 5 rows of the RawSalary column
print(df['RawSalary'].head())

# Remove the thousands separators using str.replace
df['RawSalary'] = df['RawSalary'].str.replace(',', '')

# Convert the column to float using astype
df['RawSalary'] = df['RawSalary'].astype('float')
# Coerce to numeric; values that cannot be converted become NaN
coerced_vals = pd.to_numeric(df['RawSalary'], errors='coerce')

# Inspect the rows that failed to convert
print(df[coerced_vals.isna()].head())
# In pandas, these separate calls:
df['column_name'] = df['column_name'].method1()
df['column_name'] = df['column_name'].method2()
df['column_name'] = df['column_name'].method3()

# can be combined using method chaining:
df['column_name'] = df['column_name'].method1().method2().method3()
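For example, the RawSalary cleanup above collapses into a single chain:

# Strip the commas and convert to float in one pass
df['RawSalary'] = df['RawSalary'].str.replace(',', '').astype('float')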
CHAPTER 3:
# Import matplotlib's pyplot module for plotting
import matplotlib.pyplot as plt

# Draw histograms of the DataFrame's numeric columns
df.hist()

# Show the graph
plt.show()

# Plot a boxplot of the 'column_1' feature
df[['column_1']].boxplot()
plt.show()

# Import seaborn for plotting
import seaborn as sns

# Plot a pairplot of the DataFrame
sns.pairplot(df)
plt.show()

# Show summary statistics
df.describe()
# Import MinMaxScaler from sklearn's preprocessing module
from sklearn.preprocessing import MinMaxScaler

# Instantiate the scaler
scaler = MinMaxScaler()

# Fit the scaler on the Age column
scaler.fit(df[['Age']])

# Transform the data into a new column of the DataFrame
df['normalized_age'] = scaler.transform(df[['Age']])
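Min-max scaling maps each value to (x - min) / (max - min); a quick check that the manual formula matches the scaler's output:

# Manual min-max computation should agree with MinMaxScaler
manual = (df['Age'] - df['Age'].min()) / (df['Age'].max() - df['Age'].min())
print((manual - df['normalized_age']).abs().max())  # close to 0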
# Standardize the Age column (zero mean, unit variance)
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(df[['Age']])
df['standardized_col'] = scaler.transform(df[['Age']])

# Apply a power transform to reduce skew in ConvertedSalary
from sklearn.preprocessing import PowerTransformer
log = PowerTransformer()
log.fit(df[['ConvertedSalary']])
df['log_ConvertedSalary'] = log.transform(df[['ConvertedSalary']])
# Trim values above the 95th percentile
q_cutoff = df['col_name'].quantile(0.95)
mask = df['col_name'] < q_cutoff
trimmed_df = df[mask]

# Remove values more than 3 standard deviations from the mean
mean = df['col_name'].mean()
std = df['col_name'].std()
cut_off = std * 3
lower, upper = mean - cut_off, mean + cut_off
new_df = df[(df['col_name'] < upper) &
            (df['col_name'] > lower)]
# Fit the scaler on the training data only
scaler = StandardScaler()
scaler.fit(train[['col']])
train['scaled_col'] = scaler.transform(train[['col']])

# FIT SOME MODEL
# ....

# Apply the same train-fitted scaler to the test data
test = pd.read_csv('test_csv')
test['scaled_col'] = scaler.transform(test[['col']])

# Compute outlier bounds from the training data only
train_mean = train['col'].mean()
train_std = train['col'].std()
cut_off = train_std * 3
train_lower = train_mean - cut_off
train_upper = train_mean + cut_off

# Subset the test data using the train-derived bounds
test = pd.read_csv('test_csv')
test = test[(test['col'] < train_upper) &
            (test['col'] > train_lower)]
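The same leakage-avoidance idea can be wrapped in a scikit-learn Pipeline, which guarantees the scaler is fit on the training data only; a minimal sketch assuming hypothetical X_train, y_train, and X_test splits:

from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Hypothetical X_train, y_train, X_test defined elsewhere
pipe = Pipeline([
    ('scale', StandardScaler()),      # fit on the training data only
    ('model', LogisticRegression())
])
pipe.fit(X_train, y_train)            # scaler statistics come from X_train
preds = pipe.predict(X_test)          # the same statistics are reused on X_test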
CHAPTER 4:
print(speech_df.head())

# Keep letters only, then lowercase the text
speech_df['text'] = speech_df['text'].str.replace('[^a-zA-Z]', ' ', regex=True)
speech_df['text'] = speech_df['text'].str.lower()
print(speech_df['text'][0])

# Character count of each speech
speech_df['char_cnt'] = speech_df['text'].str.len()
print(speech_df['char_cnt'].head())
# Split the text into lists of words
speech_df['word_splits'] = speech_df['text'].str.split()
print(speech_df['word_splits'].head(1))

# Word count of each speech
speech_df['word_cnt'] = speech_df['word_splits'].str.len()

# Average word length
speech_df['avg_word_len'] = speech_df['char_cnt'] / speech_df['word_cnt']
from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer()
print(cv)

# min_df: minimum fraction of documents a word must occur in
# max_df: maximum fraction of documents a word can occur in
cv = CountVectorizer(min_df=0.1, max_df=0.9)

cv.fit(speech_df['text_clean'])
cv_transformed = cv.transform(speech_df['text_clean'])
print(cv_transformed)

# Convert the sparse matrix to a dense array
cv_transformed.toarray()

feature_names = cv.get_feature_names_out()
print(feature_names)
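A small sketch, on a hypothetical toy corpus, of how min_df and max_df prune the vocabulary:

from sklearn.feature_extraction.text import CountVectorizer

# Hypothetical corpus: 'the' appears in every document, most
# other words in only one of the four
docs = ['the cat sat', 'the dog ran', 'the cat ran', 'rare word the']

cv = CountVectorizer(min_df=0.5, max_df=0.9)
cv.fit(docs)
# 'the' is too common and the one-document words are too rare,
# so only 'cat' and 'ran' survive
print(cv.get_feature_names_out())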
# fit_transform does both steps at once
cv_transformed = cv.fit_transform(speech_df['text_clean'])
print(cv_transformed)

# Build a DataFrame of counts with prefixed column names
cv_df = pd.DataFrame(cv_transformed.toarray(),
                     columns=cv.get_feature_names_out()).add_prefix('Counts_')
print(cv_df.head())

# Append the count features to the original DataFrame
speech_df = pd.concat([speech_df, cv_df], axis=1, sort=False)
print(speech_df.shape)
print(speech_df['Counts_the'].head())
from sklearn.feature_extraction.text import TfidfVectorizer

tv = TfidfVectorizer()
print(tv)

# max_features: maximum number of columns created from TF-IDF
# stop_words: list of common words to omit, e.g. "and", "the"
tv = TfidfVectorizer(max_features=100,
                     stop_words='english')

tv.fit(train_speech_df['text'])
train_tv_transformed = tv.transform(train_speech_df['text'])

train_tv_df = pd.DataFrame(train_tv_transformed.toarray(),
                           columns=tv.get_feature_names_out())\
                           .add_prefix('TFIDF_')
train_speech_df = pd.concat([train_speech_df, train_tv_df],
                            axis=1, sort=False)

# Inspect the highest-weighted terms in the first row
examine_row = train_tv_df.iloc[0]
print(examine_row.sort_values(ascending=False))
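As a toy sketch of what TF-IDF rewards, assuming a tiny hypothetical corpus:

from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd

docs = ['the economy is growing',
        'the economy is shrinking',
        'peace and prosperity']
tv_toy = TfidfVectorizer(stop_words='english')
tfidf = tv_toy.fit_transform(docs)

toy_df = pd.DataFrame(tfidf.toarray(), columns=tv_toy.get_feature_names_out())
# 'economy' appears in two documents, so it is down-weighted relative
# to document-specific words like 'growing' or 'peace'
print(toy_df.round(2))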
# Apply the train-fitted vectorizer to the test data
test_tv_transformed = tv.transform(test_speech_df['text_clean'])
test_tv_df = pd.DataFrame(test_tv_transformed.toarray(),
                          columns=tv.get_feature_names_out())\
                          .add_prefix('TFIDF_')
test_speech_df = pd.concat([test_speech_df, test_tv_df],
                           axis=1, sort=False)
# Bigram vectorizer: ngram_range=(2, 2) keeps pairs of words only
tv_bi_gram_vec = TfidfVectorizer(ngram_range=(2, 2))

# Fit and apply the bigram vectorizer
tv_bi_gram = tv_bi_gram_vec.fit_transform(speech_df['text'])

# Print the bigram features
print(tv_bi_gram_vec.get_feature_names_out())

# Create a DataFrame with the bigram features
tv_df = pd.DataFrame(tv_bi_gram.toarray(),
                     columns=tv_bi_gram_vec.get_feature_names_out())\
                     .add_prefix('Counts_')

# Sum each column and show the most common bigrams
tv_sums = tv_df.sum()
print(tv_sums.head())
print(tv_sums.sort_values(ascending=False).head())