data analysis & visualization

보호되어 있는 글입니다.
내용을 보시려면 비밀번호를 입력하세요.

확인

크롤링을 자동으로 실행하려면 매번 환경을 직접 구축해 주어야 하는데, 이것이 귀찮은 일이다.

이에 따라 terminal의 code를 R 스크립트에서 실행하는 코드를 찾아보았다.

rstudioapi 패키지가 그 기능을 하고 있었다.

 

install.packages('rstudioapi')
library(rstudioapi)

 

먼저 terminal을 생성하고 

 

# Create a new RStudio terminal; the value kept in myTerm is what the later
# terminalRunning() / terminalActivate() / terminalSend() calls receive.
myTerm <- rstudioapi::terminalCreate()

 

실행될 때까지 기다려준다.

 

# Make sure the terminal's shell is running before any command is sent to it.
if (!rstudioapi::terminalRunning(myTerm)) {
  # start the terminal shell back up, but don't bring to front
  rstudioapi::terminalActivate(myTerm, show = FALSE)
  # wait for it to start
  while (!rstudioapi::terminalRunning(myTerm)) {
    Sys.sleep(0.1)
  }
}

 

실행이 되면 terminalSend 함수를 활용하여 terminal에 command를 입력해보자.

 

# Type two commands into the terminal: change into the Selenium folder, then
# start the standalone Selenium server on port 4445. The pieces below
# concatenate to exactly the text that is sent (including the empty line
# between the two commands).
rstudioapi::terminalSend(
  myTerm,
  paste0(
    "cd c:/selenium\n",
    "\n",
    "java -Dwebdriver.gecko.driver=\"geckodriver.exe\" -jar selenium-server-standalone-3.9.1.jar -port 4445\n"
  )
)



먼저 크롬을 열고 네이버데이터랩으로 이동!

# Start a browser session and load the Naver DataLab real-time keyword page.
remDr$open()
url <- "https://datalab.naver.com/keyword/realtimeList.naver?where=main"
remDr$navigate(url)

source를 가져온 후 text로 변환


# Parse the rendered page source and collect the text of every
# "div.rank_inner" node (one character string per node).
html <- read_html(remDr$getPageSource()[[1]])
sWords <- html_text(html_nodes(html, "div.rank_inner"))

str_split을 통해 분리
# Break each node's text into its individual lines; n is the number of nodes.
sWords <- str_split(sWords, "\n")
n <- length(sWords)

단어를 클렌징
# Clean the first panel: strip spaces, then drop the lines left empty.
# The index is written as 1 because no loop variable `i` exists at this point
# (the loop further down starts at the second panel).
sWords2 <- gsub(" ", "", sWords[[1]])
sWords2 <- sWords2[nchar(sWords2) != 0]

제목 추출
# The first remaining line is used as the column title; the rest is kept as data.
title <- sWords2[1]
sWords2 <- sWords2[-1]

df에 데이터프레임 생성

(옵션에 stringsAsFactors=F를 주면,마지막에 정렬해주지 않아도 된다.)

# Odd positions hold the rank, even positions the matching keyword.
# stringsAsFactors = FALSE keeps `rank` as text on every R version, so a later
# as.numeric() converts the real rank values instead of factor level codes.
df <- data.frame(
  rank = sWords2[seq(1, length(sWords2), 2)],
  sWords2[seq(2, length(sWords2), 2)],
  stringsAsFactors = FALSE
)
colnames(df) <- c("rank", title)
data <- df

반복


# Repeat the same cleaning for every remaining panel and join it onto `data`
# by rank. seq_len(n)[-1] is empty when there is only one panel, whereas 2:n
# would count down through c(2, 1).
for (i in seq_len(n)[-1]) {
  sWords2 <- gsub(" ", "", sWords[[i]])
  sWords2 <- sWords2[nchar(sWords2) != 0]
  title <- sWords2[1]
  sWords2 <- sWords2[-1]
  df <- data.frame(
    rank = sWords2[seq(1, length(sWords2), 2)],
    sWords2[seq(2, length(sWords2), 2)],
    stringsAsFactors = FALSE
  )
  colnames(df) <- c("rank", title)
  data <- merge(data, df, by = "rank")
}

# Convert rank to a number (going through character so a factor column is
# converted by label, not by level code), then store the sorted table back
# into `data`; previously the sorted result was computed and thrown away.
data$rank <- as.numeric(as.character(data$rank))
data <- data[order(data$rank), ]
data


# Close the browser session held by remDr.
remDr$close()

 

전체 코드

# Complete script: scrape the Naver DataLab real-time keyword panels into one
# table with a `rank` column plus one keyword column per panel.
remDr$open()
url <- "https://datalab.naver.com/keyword/realtimeList.naver?where=main"
remDr$navigate(url)

html <- remDr$getPageSource()[[1]]
html <- read_html(html)
sWords <- html %>% html_nodes("div.rank_inner") %>% html_text()
sWords <- str_split(sWords, "\n")

n <- length(sWords)

# Turn the text lines of one panel into a two-column data frame.
# After spaces and empty lines are removed, the first line is the column title
# and the remaining lines alternate rank, keyword, rank, keyword, ...
panel_to_df <- function(lines) {
  cleaned <- gsub(" ", "", lines)
  cleaned <- cleaned[nchar(cleaned) != 0]
  title <- cleaned[1]
  cleaned <- cleaned[-1]
  df <- data.frame(
    rank = cleaned[seq(1, length(cleaned), 2)],
    cleaned[seq(2, length(cleaned), 2)],
    # keep rank as text so as.numeric() below converts the real values
    stringsAsFactors = FALSE
  )
  colnames(df) <- c("rank", title)
  df
}

# The first panel seeds the table (the original indexed it with an undefined `i`).
data <- panel_to_df(sWords[[1]])

# seq_len(n)[-1] is empty when there is a single panel; 2:n would run c(2, 1).
for (i in seq_len(n)[-1]) {
  data <- merge(data, panel_to_df(sWords[[i]]), by = "rank")
}

# merge() sorts rank as text ("1", "10", "2", ...); convert and re-sort
# numerically, storing the result this time.
data$rank <- as.numeric(data$rank)
data <- data[order(data$rank), ]
data

'R > crawling' 카테고리의 다른 글

동네예보 최종  (0) 2019.08.07
terminal code R을 이용해 실행하기  (0) 2019.07.31
기상청 자료 다운로드  (0) 2019.05.22
XML package를 활용한 정적 크롤링  (0) 2019.04.15
PlotGoogleMaps 사용해 AWS, ASOS 위치 나타내기  (0) 2019.04.11

library(devtools)

install_github('qkdrk777777/kma2')

library(kma2)

library(RSelenium)

library(stringr)


pack2(c("rvest", "httr", "stringr", "RCurl", "XML", "progress"))


# Work inside the ASOS data folder and create a 'kma' entry there if the
# folder listing does not already contain one.
dir <- 'Y:/data/asos'
setwd(dir)

if (!('kma' %in% list.files())) dir.create('kma')


#' Find a page element and act on it, retrying until it works
#'
#' Repeatedly asks the global `remDr` driver for the element identified by
#' `css`, stores it in the global variable named by `var`, and then clicks it
#' or types into it. Any error (element not present yet, click failed, ...)
#' resets the global variable to TRUE and triggers another attempt after
#' half a second.
#'
#' The original version ended its loop only because comparing the stored
#' element with TRUE raised an error that an outer try() swallowed; the loop
#' now ends explicitly on the first successful attempt.
#'
#' @param var Name of the global variable that receives the element.
#' @param css Locator string, interpreted according to `using`.
#' @param type 'click' to click the element, 'sendKeys' to clear it and send
#'   `messages`; any other value only locates and stores the element.
#' @param messages List passed to sendKeysToElement() when type is 'sendKeys'.
#' @param using Locator strategy handed to findElement().
#' @param timeout Maximum number of seconds to keep retrying. The default Inf
#'   keeps the original retry-forever behaviour.
#' @return Invisibly TRUE once the action succeeded; stops with an error if a
#'   finite `timeout` is exceeded.
element <- function(var, css, type = 'click', messages = NULL,
                    using = 'css selector', timeout = Inf) {
  # TRUE in the global slot means "not found yet"
  assign(var, TRUE, envir = .GlobalEnv)
  started <- Sys.time()

  repeat {
    succeeded <- suppressMessages(tryCatch({
      target <- remDr$findElement(using = using, css)
      assign(var, target, envir = .GlobalEnv)
      if (type == 'click') {
        target$clickElement()
      } else if (type == 'sendKeys') {
        target$clearElement()
        target$sendKeysToElement(messages)
      }
      TRUE
    }, error = function(e) {
      assign(var, TRUE, envir = .GlobalEnv)
      FALSE
    }))

    # pause after every attempt, successful or not, as the original did
    Sys.sleep(.5)

    if (succeeded) break
    if (as.numeric(difftime(Sys.time(), started, units = 'secs')) > timeout) {
      stop("element(): gave up waiting for '", css, "'", call. = FALSE)
    }
  }

  invisible(TRUE)
}



# Download path setup -----

#' Start a Selenium browser session that downloads into `<dir>/delete`
#'
#' Changes the working directory to `dir`, makes sure a 'delete' entry exists
#' there, builds the Chrome download preferences, creates the remote driver,
#' stores it in the global variable named by `var`, and opens the session.
#'
#' Fixes over the original: the session is opened on the driver that was just
#' created (the original always called `remDr$open()`, which fails for any
#' other `var`), and the popup preference is 0L as in the RSelenium download
#' example instead of the stray port number 4447L.
#'
#' @param var Name of the global variable that receives the driver.
#' @param dir Base folder; it becomes the working directory.
#' @param port Port of the running Selenium server.
#' @param browser Browser name passed to remoteDriver().
#' @return The value of the driver's open() call.
open <- function(var = 'remDr', dir = 'C:/Users/OWNER/Desktop/kma',
                 port = 4447L, browser = 'chrome') {
  setwd(dir)

  if (!('delete' %in% list.files())) dir.create('delete')

  capabilities <- list(chromeOptions = list(prefs = list(
    profile.default_content_settings.popups = 0L,
    download.prompt_for_download = FALSE,
    download.default_directory = paste0(dir, '/delete')
  )))
  # still published as the global `eCaps`, as before
  assign('eCaps', capabilities, envir = .GlobalEnv)

  driver <- remoteDriver(port = port, browserName = browser,
                         extraCapabilities = capabilities)
  assign(var, driver, envir = .GlobalEnv)

  driver$open()
}

# Start the browser against the Selenium server on port 4445, using
# 'Y:/data/asos' as the base folder.
open(port=4445L,dir='Y:/data/asos')


# Load the data.kma.go.kr listing page, then click the element matching
# 'a#loginBtn' (stored in the global variable remDr2).
remDr$navigate("https://data.kma.go.kr/data/rmt/rmtList.do?code=420&pgmNo=572")

element(var='remDr2',css='a#loginBtn')



# Log in -----
# The account id and password are read from the environment rather than being
# written into the script. Set KMA_LOGIN_ID and KMA_LOGIN_PW (for example in
# ~/.Renviron) before running.
kma_login_id <- Sys.getenv('KMA_LOGIN_ID')
kma_login_pw <- Sys.getenv('KMA_LOGIN_PW')

if (!nzchar(kma_login_id) || !nzchar(kma_login_pw)) {
  stop('Set the KMA_LOGIN_ID and KMA_LOGIN_PW environment variables before logging in.',
       call. = FALSE)
}

element(var = 'id_', css = "input#loginId.input-medium", type = 'sendKeys',
        messages = list(kma_login_id))

element(var = 'pw_', css = "input#passwordNo.input-medium", type = 'sendKeys',
        messages = list(kma_login_pw))

# Click the login button, located by XPath.
element(var = 'login', css = '//*[@id=\"loginbtn\"]', using = 'xpath')

# element(var='login',css='//*[@id=\"loginbtn\"]',using='xpath',type='sendKeys',messages = list(key='enter'))


### Data type ----

# Name of the data set to work with; select_type() navigates to the page URL
# stored under this name, and later steps branch on whether it is 'forcast'.
type='forcast'

#' Navigate the browser to the page for one data set
#'
#' Looks `type` up in a fixed table of page URLs and navigates the global
#' `remDr` driver there. The original resolved `type` with get(), which would
#' pick up any object of that name in scope; unknown names now fail with a
#' clear message.
#'
#' @param type One of 'asos', 'aws', 'AWO', 'nkw', 'forcast'.
#' @return The value of remDr$navigate().
select_type <- function(type) {
  pages <- c(
    asos = 'https://data.kma.go.kr/data/grnd/selectAsosRltmList.do?pgmNo=36',
    aws = 'https://data.kma.go.kr/data/grnd/selectAwsRltmList.do?pgmNo=56',
    # Agricultural weather observation
    AWO = 'https://data.kma.go.kr/data/grnd/selectAgrRltmList.do?pgmNo=72',
    # North Korea weather observation
    nkw = 'https://data.kma.go.kr/data/grnd/selectNkRltmList.do?pgmNo=58',
    forcast = "https://data.kma.go.kr/data/rmt/rmtList.do?code=420&pgmNo=572"
  )

  if (!is.character(type) || length(type) != 1L || !(type %in% names(pages))) {
    stop("select_type(): `type` must be one of ",
         paste(names(pages), collapse = ", "), call. = FALSE)
  }

  remDr$navigate(pages[[type]])
}

# Navigate to the 'asos' page first, then to the page for the configured `type`.
select_type(type='asos')

select_type(type=type)


### Time resolution of the data -----

# Resolution key; select_timeType() and select_priod() recognise
# 'hour', 'day', 'mon', 'year' and 'min'.
timeType='hour'

#' Choose the time resolution in the page's drop-down
#'
#' Maps `timeType` to the option value used by the page and clicks that
#' option through element(). The original resolved `timeType` with get(),
#' which would accept any object name in scope; unknown keys now fail with a
#' clear message.
#'
#' @param timeType One of 'hour', 'day', 'mon', 'year', 'min'.
#' @return The value of the element() call.
select_timeType <- function(timeType) {
  option_codes <- c(hour = 'F00502', day = 'F00501', mon = 'F00513',
                    year = 'F00512', min = 'F00503')

  if (!is.character(timeType) || length(timeType) != 1L ||
      !(timeType %in% names(option_codes))) {
    stop("select_timeType(): `timeType` must be one of ",
         paste(names(option_codes), collapse = ", "), call. = FALSE)
  }

  element(var = 'option1',
          css = paste0("//*/option[@value ='", option_codes[[timeType]], "']"),
          using = 'xpath')
}

# select_timeType(timeType='mon')

# select_timeType(timeType=timeType)

# select_timeType(timeType='hour')

####-----

#' Fill in the start and end of the requested period on the search form
#'
#' Which controls are used depends on `timeType`:
#' * 'hour'        - date pickers for start and end, plus an hour option each
#' * 'day', 'min'  - date pickers for start and end
#' * 'mon'         - year and month options for start and end
#' * 'year'        - year options for start and end
#' Any other `timeType` does nothing.
#'
#' The numeric positions used below (which of several matching <option>
#' elements to click) are the ones hard-coded in the original script.
#' NOTE(review): they presumably reflect the page layout at the time the
#' script was written - confirm against the live page.
#'
#' Changes from the original:
#' * the end day in the 'hour' branch used as.Date(end), which converts in
#'   UTC; it now uses the local calendar date like every other branch;
#' * `type = NULL` (the default) no longer raises an error in the 'mon' branch;
#' * the hour is taken with format(x, "%H"), so a time at exact midnight
#'   yields "00" instead of an empty string.
#'
#' @param type Data set name; only 'forcast' changes behaviour (month option
#'   positions in the 'mon' branch).
#' @param timeType 'hour', 'day', 'mon', 'year' or 'min'.
#' @param start,end Period boundaries (POSIXct; character timestamps in
#'   'YYYY-MM-DD HH:MM' form also work).
#' @return Invisibly NULL.
select_priod <- function(type = NULL, timeType,
                         start = as.POSIXct('2019-05-09 18:00'),
                         end = as.POSIXct('2019-05-10 18:00')) {
  is_forecast <- identical(type, 'forcast')

  start_input <- 'input#startDt.input-medium.inline.hasDatepicker'
  end_input <- 'input#endDt.input-medium.inline.hasDatepicker'

  # XPath matching every <option> with the given value attribute
  option_xpath <- function(value) {
    paste0("//*/option[@value ='", value, "']")
  }

  # Click the index-th <option> on the page whose value attribute is `value`
  click_option <- function(value, index) {
    matches <- remDr$findElements('xpath', value = option_xpath(value))
    matches[[index]]$clickElement()
  }

  # Year ("YYYY"), month ("MM") and day (number) of the local calendar date
  date_parts <- function(when) {
    stamp <- as.character(as.Date(as.character(when)))
    list(year = substr(stamp, 1, 4),
         month = substr(stamp, 6, 7),
         day = as.numeric(substr(stamp, 9, 10)))
  }

  # Two-digit hour of a timestamp
  hour_of <- function(when) {
    if (inherits(when, 'POSIXt')) {
      format(when, '%H')
    } else {
      substr(as.character(when), 12, 13)
    }
  }

  # Open a date picker, choose year and month, then click the day link
  pick_calendar_day <- function(when, var, input_css, month_index, pause = 0) {
    parts <- date_parts(when)
    element(var = var, css = input_css)
    click_option(parts$year, 3)
    # the month options are zero-based (January is value 0)
    click_option(as.numeric(parts$month) - 1, month_index)
    day_links <- remDr$findElements('css', value = 'a.ui-state-default')
    day_links[[parts$day]]$clickElement()
    if (pause > 0) Sys.sleep(pause)
  }

  switch(timeType,
    hour = {
      # Hourly data: start date and hour
      pick_calendar_day(start, 'st', start_input, month_index = 1)
      # (opening the hour drop-down first is not needed, so it is skipped)
      element(var = 'stTimes', using = 'xpath',
              css = option_xpath(hour_of(start)))
      # end date and hour
      pick_calendar_day(end, 'ed', end_input, month_index = 5, pause = 1)
      click_option(hour_of(end), 2)
    },
    # Daily and per-minute data use the same two date pickers
    day = ,
    min = {
      pick_calendar_day(start, 'st', start_input, month_index = 1, pause = 2)
      pick_calendar_day(end, 'ed', end_input, month_index = 5, pause = 2)
    },
    mon = {
      # Monthly data: year and two-digit month for start, then for end
      click_option(date_parts(start)$year, 1)
      click_option(date_parts(start)$month, if (is_forecast) 1 else 3)
      click_option(date_parts(end)$year, 2)
      click_option(date_parts(end)$month, if (is_forecast) 2 else 4)
    },
    year = {
      # Yearly data: start year, then end year
      click_option(date_parts(start)$year, 1)
      click_option(date_parts(end)$year, 2)
    }
  )

  invisible(NULL)
}


# Period to request.
start <- as.POSIXct("2018-01-01 00:01")
end <- as.POSIXct("2018-12-31 23:00")

# select_priod(timeType='mon',start,end,type='forcast')

# Fill in the period controls at hourly resolution.
select_priod(type = NULL, timeType = "hour", start = start, end = end)

# Select all variables ------

#' Tick the top-level checkbox in the variable chooser and confirm
#'
#' Opens the chooser (the forecast page uses a different button from the other
#' pages), clicks the 'ztree_1_check' checkbox and then the complete button.
#' `type` is compared with identical(), so the default NULL takes the
#' non-forecast path instead of raising "argument is of length zero".
#'
#' @param type Data set name; only 'forcast' selects the forecast button.
#' @return The value of the last element() call.
select_var <- function(type = NULL) {
  chooser_button <- if (identical(type, 'forcast')) {
    'input#btnStn.selectBtn1.btn.btn-primary.VAR1_BTN'
  } else {
    'input#gubun.selectBtn2.btn.btn-primary'
  }

  element(var = 'variable', css = chooser_button)
  element(var = 'variable1', css = 'span#ztree_1_check.button.chk.checkbox_false_full')
  element(var = 'variable_enter', css = 'li.btn-sitetree-complete')
}

# select_var(type='forcast')

# 'asos' is not 'forcast', so this uses the non-forecast chooser button.
select_var(type='asos')

#' Tick the top-level checkbox in the station chooser and confirm
#'
#' Opens the chooser (the forecast page uses a different button from the other
#' pages), clicks the 'ztree_1_check' checkbox and then the complete button.
#' `type` is compared with identical(), so the default NULL takes the
#' non-forecast path instead of raising "argument is of length zero".
#'
#' @param type Data set name; only 'forcast' selects the forecast button.
#' @return The value of the last element() call.
select_area <- function(type = NULL) {
  # Select all stations -----
  # Per-station selection is left for later; for now everything is queried.
  ## Encoding conversion for the station names in the kma2 data
  # names(citydata)=iconv(names(citydata),'cp949','UTF-8')
  # names(citydata2)=iconv(names(citydata2),'cp949','UTF-8')
  # for(i in 1:length(citydata)){
  #   names(citydata[[i]])=iconv(names(citydata[[i]]),'cp949','UTF-8')}
  # city_index=which(names(citydata)%in%'서울특별시')

  chooser_button <- if (identical(type, 'forcast')) {
    'input#btnStn.selectBtn1.btn.btn-primary.VAR3_BTN'
  } else {
    'input#btnStn1.selectBtn1.btn.btn-primary'
  }

  element(var = 'area', css = chooser_button)
  element(var = 'area1', css = 'span#ztree_1_check.button.chk.checkbox_false_full')
  element(var = 'area_enter', css = 'li.btn-sitetree-complete')
}

# select_area(type='forcast')

# Open the station chooser that matches the configured `type`.
select_area(type=type)

# Search -----

# Click the <option> whose value is '100' (presumably the rows-per-page
# setting - confirm on the page), then click the search button.
select_search <- function() {
  option_100_xpath <- paste0("//*/option[@value ='", '100', "']")
  element(var = 'n_list', css = option_100_xpath, using = 'xpath')

  element(var = 'search', css = 'a.addBtn.btn-img-detail')
}

select_search()

# Number of records -----
# Read the text of 'span.float-left', remove Hangul characters and spaces,
# and convert what is left to a number.
n <- as.numeric(
  gsub(
    '[ㄱ-힣]| ', '',
    remDr$findElement('css selector', 'span.float-left')$getElementText()[[1]]
  )
)

remDr$screenshot(display = TRUE)

# Download step for every data set except 'forcast': request the CSV through
# the page, wait for it to appear in '<dir>/delete', move it into <dir>, and
# rename it after type, time resolution, period and page number.
# type='asos'

if(type!='forcast'){

  # The loop visits a single value, pageNum = 1.
  for(pageNum in 1){

    # Sys.sleep(.5)

    # Only the message() call is governed by this `if` (it has no braces): the
    # progress line is printed when no file with the target name is listed in
    # the working directory. The statements after it run regardless.
    # NOTE(review): the intent may have been to skip already-downloaded pages - confirm.
    if(!paste0(type,'_',timeType,'_',gsub('-','',gsub(' ','hour',substr(as.character(start),1,13)))

               ,'to',gsub('-','',gsub(' ','hour',substr(as.character(end),1,13))),'(',pageNum,').csv')%in%list.files())

      message(round(pageNum/ceiling(n/100)*100,3),'%')  

    # Click 'a.btn.btn-default' (presumably the download button - confirm),
    # then the 'input#reqstPurposeCd7' input.
    element(var='download',css='a.btn.btn-default')

    # element(var='use',css='input#reqstPurposeCd')

    element(var='use',css='input#reqstPurposeCd7')

    

    # use<<-remDr$findElement('xpath','input#reqstPurposeCd7')

    # while(length(use)==0){

    # use<<-remDr$findElement('css selector','input#reqstPurposeCd7')

    # }

    # use$clickElement()

    Sys.sleep(1)

    # Call the page's own fnRltmRequest() JavaScript function.
    remDr$executeScript(script="fnRltmRequest();",args=1:2)

    # Poll once per second until at least one .csv file exists in <dir>/delete.
    while(length(list.files(paste0(dir,'/delete'),pattern = 'csv$'))==0){

      Sys.sleep(1)}

    # 'delete' is listed relative to the working directory here, while the
    # copy/remove paths below are built from `dir`.
    fileName=list.files('delete',pattern='csv$')

    # Copy the downloaded file(s) from <dir>/delete into <dir>.
    copy=file.copy(

      paste0(dir,'/delete/',

             list.files('delete',pattern='csv$')),

      paste0(dir,'/',

             list.files('delete',pattern='csv$')))

    # Remove the original only after a successful copy; leave the loop if the
    # copy or the removal fails.
    if(copy==T){

      remove=file.remove(paste0(dir,'/delete/',

                                list.files('delete',pattern='csv$')))

      if(remove!=T){break}

    }else break

    # Rename the copied file to
    # <type>_<timeType>_<start>to<end>(<pageNum>).csv; both names are relative
    # to the working directory.
    file.rename(fileName,

                paste0(type,'_',timeType,'_',gsub('-','',gsub(' ','hour',substr(as.character(start),1,13)))

                       ,'to',gsub('-','',gsub(' ','hour',substr(as.character(end),1,13))),'(',pageNum,').csv'))

    

    # Call the page's goPage() JavaScript function with the current page number.
    remDr$executeScript(script = paste0('goPage(',pageNum,'); return false;'),args=1:2)

    Sys.sleep(2)

  }

}


# Download step for the 'forcast' data set: on every result page, click each
# download button in turn, wait for the file to arrive in 'delete', move it
# into `dir`, and rename it after the first three columns of the result table.
#
# Changes from the original: the button loop uses seq_along(), so a page with
# no download buttons is skipped instead of iterating over c(1, 0) and then
# waiting forever for a file; `down` is refreshed with a plain assignment;
# the table is no longer stored in a variable called `list`.
if (type == 'forcast') {
  k <- 0   # result pages handled so far
  kk <- 0  # download buttons handled so far (progress display only)

  while (TRUE) {
    # Columns 1-3 of the second HTML table on the page, rows with NA dropped
    result_table <- data.frame(
      na.omit(readHTMLTable(remDr$getPageSource()[[1]])[[2]][, 1:3])
    )
    # Target file name per row: the three columns pasted together with
    # spaces, '/', ',', '-' and '>' removed
    target_names <- paste0(
      gsub(' |/|,|-|>', '',
           paste0(result_table[, 1], result_table[, 2], result_table[, 3])),
      '.csv'
    )

    down <- remDr$findElements(using = "css selector",
                               value = "input.btn.btn-default.DATA_DOWN_BTN")

    for (i in seq_along(down)) {
      kk <- kk + 1
      message(round(kk / n * 100, 3), '%')

      try(silent = TRUE, {
        down[[i]]$clickElement()
      })

      # Only for the first button on a page: after a pause, click the 8th
      # check-list input and the 4th primary button.
      # NOTE(review): presumably a usage-purpose dialog that appears once - confirm.
      if (i == 1) {
        Sys.sleep(3)
        try(silent = TRUE, {
          a <- remDr$findElements('css selector', 'ul.check-list input')
          a[[8]]$clickElement()
          close2 <- remDr$findElements(using = "css selector",
                                       value = "input.btn.btn-primary")
          close2[[4]]$clickElement()
        })
      }

      # Until something shows up in 'delete': dismiss an OK dialog if one is
      # present, re-find the buttons and click the current one again.
      while (length(list.files('delete')) == 0) {
        suppressMessages({
          try(silent = TRUE, {
            try(silent = TRUE, {
              error <- remDr$findElement('css selector', 'button.buttonOK')
              error$clickElement()
            })

            down <- remDr$findElements(using = "css selector",
                                       value = "input.btn.btn-default.DATA_DOWN_BTN")
            down[[i]]$clickElement()

            # Kept from the original: if exactly one file is present right
            # after the re-click, its csv is removed.
            # NOTE(review): looks like it discards a file so the repeated
            # click's download is the one kept - confirm.
            if (length(list.files('delete')) == 1) {
              file.remove(paste0(dir, '/delete/',
                                 list.files('delete', pattern = 'csv$')))
            }
          })
        })
      }

      Sys.sleep(1)

      # Move the downloaded csv from <dir>/delete into <dir>, then rename it.
      fileName <- list.files('delete', pattern = 'csv$')
      file.copy(paste0(dir, '/delete/', list.files('delete', pattern = 'csv$')),
                paste0(dir, '/', list.files('delete', pattern = 'csv$')))
      file.remove(paste0(dir, '/delete/',
                         list.files('delete', pattern = 'csv$')))
      file.rename(fileName, target_names[i])
    }

    k <- k + 1

    # Click the element with class 'next_page', then stop once more pages have
    # been handled than n / 100 (rounded up).
    element('page', using = 'class name', 'next_page')

    if (k > ceiling(n / 100)) break
  }
}


RPubs - url을 통한 크롤링(정적 크롤링만 가능)


RPubs - plotgooglemaps를 통한 동적 지도 그림그리기