# Exploration of missing values in the Seoul daily-lowest-temperature CSV:
# count the missing cells, then compare dropping them against filling them.
#
# This is a notebook-style script: a bare expression on its own line (for
# example ``temp_data.sample(5)``) only shows its value in an interactive
# session; run as a plain script, such lines produce no output.

# Modules we'll use.
import pandas as pd
import numpy as np

# NOTE(review): this path contains no directory separators, so it looks like
# backslashes were lost at some point (perhaps "C:\tmp\..."). Confirm the
# intended location; if backslashes are restored, use a raw string so that
# "\t" is not interpreted as a tab character.
temp_data = pd.read_csv("C:tmpDaily_Lowest_Temp_Seoul_Missing.csv")

# Seed NumPy's global random generator so that shuffling / random draws
# (such as the sample below) are reproducible.
np.random.seed(0)
temp_data.sample(5)

# Get the number of missing data points per column.
missing_values_count = temp_data.isnull().sum()

# Look at the number of missing points in the first two columns
# (.iloc makes the positional slice explicit).
missing_values_count.iloc[0:2]

# How many total missing values do we have?
# np.prod replaces np.product, which was removed in NumPy 2.0.
total_cells = np.prod(temp_data.shape)
total_missing = missing_values_count.sum()

# Percent of data that is missing.
(total_missing / total_cells) * 100

# Remove all the rows that contain a missing value. dropna() returns a new
# frame and the result is not assigned, so temp_data itself is unchanged.
temp_data.dropna()

# Remove all columns with at least one missing value.
columns_with_na_dropped = temp_data.dropna(axis=1)
columns_with_na_dropped.head()

# Just how much data did we lose?
print("Columns in original dataset: %d \n" % temp_data.shape[1])
print("Columns with na's dropped: %d" % columns_with_na_dropped.shape[1])

# Get a small subset of the temperature dataset: the first rows of the
# label-based column range from '관측일자' through '최저기온(℃)' (inclusive).
subset_temp_data = temp_data.loc[:, '관측일자':'최저기온(℃)'].head()
subset_temp_data

# Replace all NA's with 0 (returns a new frame; the subset is unchanged).
subset_temp_data.fillna(0)

# Replace each NA with the value that comes directly after it in the same
# column, then replace all the remaining NA's with 0.
# DataFrame.bfill() replaces the deprecated fillna(method='bfill').
subset_temp_data.bfill(axis=0).fillna(0)