Snippets for Data Science

  • Update : 2020.08.30

df.drop()

# Drop columns by label. drop() returns a new DataFrame unless inplace=True.
df = df.drop(columns="col1")
df = df.drop(columns=["col2", "col3"])
# In-place variant: df itself is modified, so there is nothing to assign.
df.drop(columns=["col4", "col5"], inplace=True)

df.fillna()

# Replace every missing value with 0 and rebind df to the result.
df = df.fillna(0)

pd.concat()

# Stack the DataFrames in the list on top of each other (row-wise).
# Add further DataFrames to the list as needed; a literal `...` inside the
# list is an Ellipsis object and makes pd.concat raise TypeError.
df = pd.concat([df1, df2])

pd.merge()

# Inner join: keep only rows where df1['df1_key'] matches df2['df2_key'].
merged_df = pd.merge(df1, df2, left_on='df1_key', right_on='df2_key', how='inner')

df.iterrows()

# Iterate row by row: idx is the row's index label, row is that row as a
# Series keyed by column name.
for idx, row in df.iterrows():
    print(idx, row)  # replace with the per-row logic

df.apply()

def apply_function(df_each_row):
    """Return the new-column value for a single row.

    Intended for ``df.apply(apply_function, axis=1)``, which passes one row
    at a time.

    Args:
        df_each_row: One row, indexable by column name.

    Returns:
        The value to store in the new column for this row.
    """
    # Example computation only -- replace with the real per-row logic.
    row_of_new_col = df_each_row["col1"] + df_each_row["col2"]
    return row_of_new_col

# axis=1 applies apply_function to each row; the results become "new_col".
df["new_col"] = df.apply(apply_function, axis=1)

df.reset_index()

# Renumber the index from 0 in place; drop=True discards the old index
# instead of keeping it as a column.
df.reset_index(drop=True, inplace=True)

line plot

# Basic line plot: the first list is x, the second is y.
x_values = [1, 4, 9, 16]
y_values = [1, 2, 3, 4]

plt.figure(figsize=(10, 10))
plt.plot(x_values, y_values)
plt.xlabel("x")
plt.ylabel("y")
plt.title("title")
plt.show()

with legend

# Two labelled series on one figure; plt.legend() displays the labels.
y_values = [1, 2, 3, 4]

plt.figure(figsize=(10, 10))
plt.plot([1, 4, 9, 16], y_values, label="train")
plt.plot([2, 3, 4, 5], y_values, label="test")
plt.xlabel("x")
plt.ylabel("y")
plt.title("title")
plt.legend()
plt.show()

beautiful legend

  • legend under the plot
# Anchor the legend's upper-center point below the axes (y = -0.13).
plt.legend(
    loc="upper center",
    bbox_to_anchor=(0.5, -0.13),
    fancybox=True,
    shadow=False,
    ncol=5,
)

with marker

  • Line plot with point markers (not a dot plot)
# Lines with a "." marker drawn at every data point.
series = {"train": [1, 4, 9, 16], "test": [2, 3, 4, 5]}

plt.figure(figsize=(10, 10))
for name, x_values in series.items():
    plt.plot(x_values, [1, 2, 3, 4], label=name, marker=".")
plt.xlabel("x")
plt.ylabel("y")
plt.title("title")
plt.legend()
plt.show()

dot plot

# The "." format string draws point markers only, with no connecting line.
x_values = [1, 4, 9, 16]
y_values = [1, 2, 3, 4]

plt.figure(figsize=(10, 10))
plt.plot(x_values, y_values, ".")
plt.xlabel("x")
plt.ylabel("y")
plt.title("title")
plt.show()

small dot plot

# The "," format string draws pixel-sized markers, with no connecting line.
x_values = [1, 4, 9, 16]
y_values = [1, 2, 3, 4]

plt.figure(figsize=(10, 10))
plt.plot(x_values, y_values, ",")
plt.xlabel("x")
plt.ylabel("y")
plt.title("title")
plt.show()

save image

# Save the current figure to disk. The file name must be a string; the
# extension selects the output format. bbox_inches='tight' trims the
# surrounding whitespace.
plt.savefig("file.png", dpi=100, bbox_inches='tight')

Matplotlib Font Setting

import matplotlib

# Default font settings passed to matplotlib.rc below.
# 'family' must be a generic family ('serif', 'sans-serif', 'cursive',
# 'fantasy', 'monospace') or the name of an installed font. 'normal' is
# neither, so matplotlib warns and falls back to its default font.
font = {'family': 'sans-serif',
        'weight': 'bold',
        'size': 12}

matplotlib.rc('font', **font)

save DataFrame as Pickle

import os
import pickle

import pandas as pd

# One record to log; its keys become the DataFrame's column names.
row_dict = {"a": 1}

log_file = os.path.join(".", "file_name.pkl")

if os.path.exists(log_file):
    # NOTE: unpickling can execute arbitrary code -- only load files that
    # this script (or another trusted source) wrote.
    with open(log_file, "rb") as f:
        prev_df = pickle.load(f)
    # Newest row goes first; reset_index renumbers the rows from 0.
    new_df = pd.concat([pd.DataFrame([row_dict]), prev_df]).reset_index(drop=True)
else:
    # First run: start the log with just this record.
    new_df = pd.DataFrame([row_dict])

with open(log_file, "wb") as f:
    pickle.dump(new_df, f)

PCA

from sklearn.decomposition import PCA

# Fit a 2-component PCA on df, then project both df and df1 with the same
# fitted model so the two projections share one coordinate system.
pca = PCA(n_components=2)
pca.fit(df)

df_pca = pca.transform(df)
df1_pca = pca.transform(df1)

plt.figure(figsize=(10,10))
# "." plots each sample as an unconnected point; without it plt.plot would
# join the samples with lines in row order.
plt.plot(df_pca[:, 0], df_pca[:, 1], ".", label="df")
plt.plot(df1_pca[:, 0], df1_pca[:, 1], ".", label="df1")
plt.title("PCA")
plt.xlabel("PC1")
plt.ylabel("PC2")
plt.legend()
plt.show()

Show all rows and columns of a DataFrame

# None removes the display limit, so every column and row is shown.
pd.options.display.max_columns = None
pd.options.display.max_rows = None

# or: raise the limits to a fixed cap (here 100) instead of removing them

pd.set_option('display.max_columns', 100)
pd.set_option('display.max_rows', 100)