Pandas - Series, Dataframe - CRUD / loc,iloc / data filtering

pandas는 시리즈와 데이터 프레임이라는 구조화된 형식을 제공

1. Series

시리즈는 데이터가 순차적으로 나열된 일차원 배열의 형태
인덱스와 value가 일대일 대응 관계
딕셔너리로 시리즈 생성. 키는 시리즈 인덱스와 대응하고 값은 시리즈의 데이터 값으로 변환됨

import pandas as pd

#series = 1st dimension data
#create
seriesdata=pd.Series([70,60,90])
seriesdata = pd.Series([70,60,90], index = ['국어','영어','수학']) #인덱스는 지정하지않으면 0부터 자동생성. 지정된 경우 지정된 인덱스 사용. 행의 레이블을 뜻함
#create using dict type. 키는 시리즈 인덱스와 대응하고 값은 시리즈의 데이터 값으로 변환됨
dict_data={"a":1,"b":2,"c":3}
series_data = pd.Series(dict_data)
#create using list type.리스트로 작성을 하면 인덱스로 변환할 값이 없음. 디폴트 인덱스가 생성되며 정수형 위치 인덱스가 지정됨
list_data = ["2022-10-11", 3.14, "ABC",100,True]
series_data = pd.Series(list_data)

#read and update
seriesdata.index = ['art','music','p.e.']
seriesdata['art']=80
print(seriesdata['art'], seriesdata.iloc[0]) #both values are the same thing

#delete
del seriesdata['art']

#pandas datatype: str=object, int=int64, float=float64, bool=bool +) datetime64[ns], timedelta64[ns](difference b/w two datetime64s)
#change the datatype
seriesdata = seriesdata.astype('float') #astype은 원본을 바꾸지 않고 새 시리즈를 반환하므로 재할당해야 반영됨

#series_연산
#간단하게 s1 + s2 이런식으로도 연산이 됨. 간단히 s1 > 2 이런식으로 넣을수도 있음.
#스칼라와의 연산도 가볍게 됨. s1 + 1 하면 모든 요소에 1이 더해짐. df도 동일하게 적용.

2. Dataframe

행과 열로 만들어지는 이차원 배열의 형태
데이터 프레임의 열은 각각의 시리즈 객체

#dataframe = two-dimensional data(table) - mostly used for data analysis/ml - has two sets of indexes
#create - you can create one with or w/o index
df=pd.DataFrame({
    'USA':[2.1,2.2,2.3],
    'ROK':[0.4,0.5,0.6],
    'CHINA':[10,13,15],
}, index=[2000,2010,2020])
#Or another example of dataframe
df1 = pd.DataFrame({
    'id': [1, 2, 3],
    'customer_id': [1, 2, 3],
    'customer_name': ['Robert', 'Peter', 'Dave']
}, columns=['id', 'customer_id', 'customer_name'])
#more example of creating
dict_data = {"c0":[1,2,3], "c1":[4,5,6]}
df = pd.DataFrame(dict_data)

#read and update - index, columns, values(series는 index, values)

#여러 df 보기
display(df, df1)

#find
.head() / .tail() #상단/ 하단 5개 행 출력(괄호안에 숫자를 넣으면 숫자만큼 행 출력)

#row index
df.index

#컬럼명확인
df.columns

#looking up the values
df.values

#unique값 뭐뭐있는지 확인
df['col_name'].unique()

#setting one column to become the index
df=pd.DataFrame({
    'year':[2000,2010,2020],
    'USA':[2.1,2.2,2.3],
    'ROK':[0.4,0.5,0.6],
    'CHINA':[10,13,15],
})
df = df.set_index('year')

#checking up the index and update
df.index.name
df.index.name='연도'

#index_reset
df=df.reset_index('연도') #인덱스를 열로 변환하고, 열을 유지 (열 삭제=> df.reset_index(drop=True))
<=>
df = df.set_index('연도')

#행추가
pd.concat([df, new_df])
#행제거
df.drop(index_name, axis=0)
#열추가
df[new_column_name] = new_value
#열제거
df.drop(drop_col_name, axis=1)
df.drop(columns="col_name")  #한 열
df.drop(columns=['col_name', 'col_name2'])  #여러 열
df.drop(columns='rankOver3', inplace = True)
df.drop([1,2,4])  #1,2,4행 지움(0이 시작행)


#행중복제거
df.drop_duplicates()

#change_type
df['release_year'] = df['release_year'].astype('Int64')
df1 = df.astype({'col1':'변경후타입'}) #한개 열만 변경
df1 = df.astype({'col1':'int32', 'col3':'int64'}) #다수 열 변경
df1= df.astype(dtype='int64') #모든 열 변경
df1= df.astype(dtype='int64',errors='ignore') #바로 위 줄 실행시 에러 발생 하는 경우, ignore하면 에러발생하지 않는 열만 수정됨


#change index
#몇개 바꿀때
df.rename({인덱스:바꿀 인덱스, 인덱스:바꿀 인덱스})
#전체 바꿀때
df.index = 바꿀 인덱스 리스트

#change col name
#열이름 일부(하나 또는 여러 개) 변경
df.rename({열이름:바꿀이름, 열이름:바꿀이름, ...}, axis=1)  #or
df.rename(columns={열이름:바꿀이름, 열이름:바꿀이름, ...})
#열이름 전체변경
df.columns = 열 이름 리스트

.loc / .iloc (가능하면 .loc를 사용(행번호/열번호는 변경될 수 있으므로)) #인덱싱 방법임!

#calling the data
df.loc[2000] #find the value through the index #[행조건,열조건]
df.loc[3,] df.loc[:,'Name']   #한쪽만 조회
df.loc[3:5,['Age','Pclass']]  df.loc[[1,3,5],['Age','Pclass']] #다양한 방법으로 조회
df.loc['row_name', 'column_name']  #단일 값(스칼라)으로 출력 (행 또는 열 한쪽만 리스트/슬라이스로 주면 시리즈)
df.loc[['row_name', 'row_name'], ['column_name', 'column_name']] #df로 리턴
name35 = titanic.loc[titanic["Age"]>35, ["Name", "Age"]] #예시

df.iloc[0] #위치인덱스를 사용하여 조회 #[행인덱스조건,열인덱스조건](행번호,열번호)
#가능하면 .loc를 사용함(행번호/열번호는 변경될 수 있으므로)

#bringup the row
df.loc[2000]
df.iloc[0]

#find the column
df['USA']  #열1개도 [[]] 이중대괄호로 추출하면 데이터프레임이 됨
df.USA
print(df['USA'][2000])
print(df.loc[2000]['USA'])
df2 = df[['CHINA','ROK']] #bring up multiple columns
df.loc[df.index != '2월'] #특정행만 제외하고 가져오기 #열로바꾸면 df.loc[:, df.columns != '매출액']
df.loc['1월':'4월'] #연속행 가져오기 #열로바꾸면 df.loc[:, '매출액':'순이익']
df.loc[[True, True, False, False, True]] #인덱스 개수와 동일한 true/false 배열을 지정해 true 인 행만 가져오기
#열로바꾸면 df.loc[:, [False, True, True]]

#update(add)
df['Vietnam'] = [1,2,3] #열추가, df[new_col_name] = new_value 
df.loc[new_row_name] = new_value #행추가
df.loc[2021] = [1,2,3] #[2021] 부분은 it has to be set as an index prior to using this function

name25.iloc[[1,2,3],0] = "No name" #1-3행까지의 0번째 열의 값을 "No name"으로 변경


#delete
del df['Vietnam']
df.drop([2000]) #행삭제, df.drop(index,axis=0)
df.drop(col_name, axis=1) #열삭제

#copy - leave the original data alone, and process the copied version. Use this when you need to bring up the necessary columns when analyzing
df2 = df[['CHINA','ROK']].copy()

#methods
.sort_values(정렬할 기준이 되는 컬럼명, ascending=False) #df.sort_values('Age')
df.sort_values(['Age','Fare'], ascending=[False,True]) #여러개 열 기준
   #인덱스 sorting: .sort_index(axis=0, level=None, ascending=False), 필수인 인자는 없음

df[조건식]  df.query('조건식') #특정 조건을 충족하는 데이터 추출하기 
#df[df['Pclass'] == 1]  df.query('Pclass == 1')  #둘중 취사선택하면 됨. 동일한 결과가 나옴 
df[(df['Pclass'] == 1) | (df['Age'] >= 30)] #and는 &, or는 | 사용 (각 조건은 괄호로 감싸야 함)
df.query('Pclass == 1 and Age >= 30') #and/or

#sampling
import random
passengerid_list = list(df['PassengerId'])
passengerid_sample = random.sample(passengerid_list, 10)
df[df['PassengerId'].isin(passengerid_sample)] #df.query('PassengerId in @passengerid_sample')

dataframe 안에서 데이터 찾기(데이터 필터링)

#.isin(): 각각의 요소가 데이터 프레임 또는 시리즈에 존재하는지 파악해서 True/False 값 반환
word_list=['a','b','c']
df[df['column_name'].isin(word_list)]

# titanic(df)에서 Pclass열에서 1의 값을 가지고 있는 경우만 True로 인식
titanic["Pclass"].isin([1])


.str[:3] #숫자만큼 필요한 문자열을 뽑아올 수 있음
df['col_name'].str.extract('(string)') #col_name 열에서 string이 들어간 곳은 string추출, 나머지는 NaN반환
#.str.contains()
df[df['column_name'].str.contains('strings', na=False, case=False)]
# 특정 문자열 이 들어가 있는 행을 데이터 프레임에서 뽑아내기
#na=False: 값이 na일 경우 false로 처리
#case=False: 대소문자 구분 없음

#불리언 인덱싱: True 값을 가진 행만 추출
#df passenger에서 Age열에서 Age가 35살 초과인 사람만 True로 처리해서 뽑기
# 결과물은 age>35에 해당하는 사람의 모든 정보(모든 열)가 나옴
#변수 = df[true로 처리할 값]
above35 = passenger[passenger["Age"]>35]

#불리언인덱싱 + .isin(): 데이터의 특정 범위만 추출
class_1 = titanic[titanic["Pclass"].isin([1])]

age2040 = passenger[passenger["Age"].isin(np.arange(20,41))] #import numpy as np 필요

#.str: series 에서만 작용, 예, df['열이름'].str.replace() 이렇게만 쓸 수 있음
#.str.contains(x): 문자열 포함 여부
df[df['Name'].str.contains('Mrs')] #df에서 name열에 Mrs가 들어가는 데이터의 전체행 보이기
#.str.replace(x,y)
df2['Name'] = df2['Name'].str.replace(',', '')
#.str.lower()    .str.upper()
#.str.split(x,expand=True/False,n=개수)  / expand는 false가 디폴트, true넣으면 열을 여러개로 나눠줌, n은 최대 분리 횟수(n=1이면 열이 2개 생김)
     #df['Name'].str.split(' ')에서 한 결과가 [Dooley, Mr.,Patrick]으로 나왔다면
df['Name'].str.split(' ', expand=True, n=1)   #여기서 expand하지않고 일부 정보만가져오기 df['Name'].str.split(' ').str[0]

'Python_Wiki > Python_Syntax' 카테고리의 다른 글

(순서)자료형(data type) - 튜플 (0)	2025.05.27
(순서)자료형(data type) - 문자열 Strings (0)	2025.05.27
(비순서)자료형(data type) - 집합, 부울 (0)	2025.05.27
(순서)자료형(data type) - 리스트 (0)	2025.05.27
(순서)자료형(data type) - 숫자 Numbers (0)	2025.05.27

사업기획 6년차 June의 데이터 블로그

Pandas - Series, Dataframe - CRUD / loc,iloc / data filtering

1. Series

2. Dataframe

'Python_Wiki > Python_Syntax' 카테고리의 다른 글

티스토리툴바

« 2026/07 »
일	월	화	수	목	금	토
			1	2	3	4
5	6	7	8	9	10	11
12	13	14	15	16	17	18
19	20	21	22	23	24	25
26	27	28	29	30	31

Pandas - Series, Dataframe - CRUD / loc,iloc / data filtering

1. Series

2. Dataframe

'Python_Wiki > Python_Syntax' 카테고리의 다른 글

관련글

티스토리툴바