Snippets for Data Science
- Update : 2020.08.30
df.drop()
# Drop a single column (returns a new DataFrame)
df = df.drop(columns="col1")
# Drop several columns at once
df = df.drop(columns=["col2", "col3"])
# Drop columns in place; no reassignment needed
df.drop(columns=["col4", "col5"], inplace=True)
df.fillna()
# Replace every missing value with 0
df = df.fillna(value=0)
pd.concat()
# `...` is a placeholder for any further DataFrames to concatenate
frames = [df1, df2, ...]
df = pd.concat(frames)
pd.merge()
# Inner join: keep only rows whose df1_key matches a df2_key
merged_df = df1.merge(
    df2,
    how='inner',
    left_on='df1_key',
    right_on='df2_key',
)
df.iterrows()
# iterrows() yields (index label, row) pairs, one per row
for idx, row in df.iterrows():
    print(idx, row)  # replace with the per-row work you need
df.apply()
def apply_function(df_each_row):
    """Return the new-column value for a single row.

    Intended for ``df.apply(apply_function, axis=1)``, which calls this once
    per row and passes that row as ``df_each_row``.
    """
    # Example computation: replace with your own logic based on the row's values
    row_of_new_col = df_each_row["col1"]
    return row_of_new_col
# Run apply_function once per row and store the results as a new column
df["new_col"] = df.apply(apply_function, axis="columns")
df.reset_index()
# drop=True discards the old index instead of adding it as a column;
# inplace=True modifies df directly, so no reassignment is needed
df.reset_index(drop=True, inplace=True)
line plot
# Basic line plot
xs = [1, 4, 9, 16]
ys = [1, 2, 3, 4]

plt.figure(figsize=(10, 10))
plt.plot(xs, ys)
plt.title("title")
plt.xlabel("x")
plt.ylabel("y")
plt.show()
with legend
# Two labelled series on one figure; plt.legend() displays the labels
plt.figure(figsize=(10, 10))
for xs, ys, name in (
    ([1, 4, 9, 16], [1, 2, 3, 4], "train"),
    ([2, 3, 4, 5], [1, 2, 3, 4], "test"),
):
    plt.plot(xs, ys, label=name)
plt.title("title")
plt.xlabel("x")
plt.ylabel("y")
plt.legend()
plt.show()
beautiful legend
- legend under the plot
# Anchor the legend's top-centre point just below the axes
plt.legend(
    loc='upper center',
    bbox_to_anchor=(0.5, -0.13),
    fancybox=True,
    shadow=False,
    ncol=5,
)
with marker
- Line plot with point markers (not a dot plot)
# Lines with a "." marker drawn at every data point
series = (
    ("train", [1, 4, 9, 16], [1, 2, 3, 4]),
    ("test", [2, 3, 4, 5], [1, 2, 3, 4]),
)

plt.figure(figsize=(10, 10))
for name, xs, ys in series:
    plt.plot(xs, ys, label=name, marker=".")
plt.title("title")
plt.xlabel("x")
plt.ylabel("y")
plt.legend()
plt.show()
dot plot
# Dot plot: the "." format string selects the point marker
points_x = [1, 4, 9, 16]
points_y = [1, 2, 3, 4]

plt.figure(figsize=(10, 10))
plt.plot(points_x, points_y, ".")
plt.title("title")
plt.xlabel("x")
plt.ylabel("y")
plt.show()
small dot plot
# Small dot plot: the "," format string selects the pixel marker
points_x = [1, 4, 9, 16]
points_y = [1, 2, 3, 4]

plt.figure(figsize=(10, 10))
plt.plot(points_x, points_y, ",")
plt.title("title")
plt.xlabel("x")
plt.ylabel("y")
plt.show()
save image
# The file name must be a string. Call savefig() before plt.show(),
# otherwise the saved image may be blank.
plt.savefig("file.png", dpi=100, bbox_inches='tight')
Matplotlib Font Setting
import matplotlib
# Default font settings, passed to matplotlib.rc('font', ...).
# 'family' must be a generic family ('sans-serif', 'serif', 'monospace', ...)
# or the name of an installed font. 'normal' is neither, and makes matplotlib
# emit "findfont: Font family ['normal'] not found" warnings.
font = {
    'family': 'sans-serif',
    'weight': 'bold',
    'size': 12,
}
matplotlib.rc('font', **font)
save DataFrame as Pickle
import os
import pickle

# Row to add to the pickled DataFrame
row_dict = {"a": 1}
# The "." components are presumably placeholders for real directory names
log_file = os.path.join(".", ".", "file_name") + ".pkl"

if os.path.exists(log_file):
    # NOTE: pickle.load can execute arbitrary code; only load files you trust
    with open(log_file, 'rb') as f:
        prev_df = pickle.load(f)
    # New row goes first, then the index is renumbered from 0
    new_df = pd.concat([pd.DataFrame([row_dict]), prev_df]).reset_index(drop=True)
else:
    # No file yet: start with just this row
    new_df = pd.DataFrame([row_dict])

with open(log_file, 'wb') as f:
    pickle.dump(new_df, f)
PCA
from sklearn.decomposition import PCA
# Fit a 2-component PCA on df, then project both df and df1 with the same model
pca = PCA(n_components=2).fit(df)
df_pca, df1_pca = pca.transform(df), pca.transform(df1)

# NOTE(review): plt.plot joins consecutive samples with lines; pass "." as a
# format string if a scatter of the projected points is what you want.
plt.figure(figsize=(10, 10))
for label, projected in (("df", df_pca), ("df1", df1_pca)):
    plt.plot(projected[:, 0], projected[:, 1], label=label)
plt.title("PCA")
plt.legend()
plt.show()
Show all rows and columns of a DataFrame
# Option 1: None removes the limit, so every column and row is displayed
pd.options.display.max_columns = None
pd.options.display.max_rows = None
# or
# Option 2: cap the display at 100 columns / 100 rows.
# Use one option or the other; if both run, these later calls win.
pd.set_option('display.max_columns', 100)
pd.set_option('display.max_rows', 100)