다른 명령
R 공시 크롤링
로그인 처리
전반적인 흐름은 다음과 같다.
- 로그인한 뒤 웹 페이지로 이동하여 정보를 수집한다.
- 수집한 정보를 데이터 프레임에 추가한 다음, 다음 페이지로 이동한다.
# Scrape the "answers" tab of a Stack Overflow user profile after logging in.
#
# Flow: open a session on the login page, fill in and submit the login form,
# then visit each results page with that session, extract the per-answer
# fields, and combine everything into a single data frame `results`.
library(rvest)

# NOTE(review): html_session(), set_values(), submit_form() and jump_to() are
# the pre-1.0 rvest names. rvest >= 1.0.0 deprecates them in favour of
# session(), html_form_set(), session_submit() and session_jump_to(). They are
# kept unchanged here; confirm the rvest version this script targets.

# Address of the login webpage
login <- "https://stackoverflow.com/users/login?ssrc=head&returnurl=http%3a%2f%2fstackoverflow.com%2f"

# Create a web session with the desired login address
pgsession <- html_session(login)
pgform <- html_form(pgsession)[[2]]  # in this case the submit is the 2nd form
filled_form <- set_values(pgform, email = "*****", password = "*****")
submit_form(pgsession, filled_form)

# Number of result pages to visit. The loop is limited to 5 pages here;
# change this to suit your application.
n_pages <- 5L

# Base address of the pages to extract information from; the page number is
# appended to it on each iteration.
base_url <- "http://stackoverflow.com/users/**********?tab=answers&sort=activity&page="

# Collect one character matrix per page and bind them once after the loop,
# instead of growing a data frame with rbind() on every iteration.
page_results <- vector("list", n_pages)

for (i in seq_len(n_pages)) {
  url <- paste0(base_url, i)
  page <- jump_to(pgsession, url)

  # Collect info on the question votes and question title: the text of the
  # <div> nodes under each answer summary is laid out two per row.
  summary <- html_nodes(page, "div .answer-summary")
  question <- matrix(
    html_text(html_nodes(summary, "div"), trim = TRUE),
    ncol = 2,
    byrow = TRUE
  )

  # Find date answered, hyperlink and whether it was accepted
  dateans <- html_attr(html_node(summary, "span"), "title")
  hyperlink <- html_attr(html_node(summary, "div a"), "href")
  accepted <- html_attr(html_node(summary, "div"), "class")

  # Per-page results: votes, title, date, class attribute, link
  page_results[[i]] <- cbind(question, dateans, accepted, hyperlink)
}

results <- as.data.frame(
  do.call(rbind, page_results),
  stringsAsFactors = FALSE
)

# Data frame clean-up
names(results) <- c("Votes", "Answer", "Date", "Accepted", "HyperLink")
results$Votes <- as.integer(results$Votes)
# 0 when the first <div> class is exactly "answer-votes default", 1 otherwise
results$Accepted <- ifelse(results$Accepted == "answer-votes default", 0, 1)