# iterating
# open the help page for mean() to check its arguments (x, trim, na.rm)
?mean
# custom function
# Compute a trimmed mean of `numbers` (ignoring NA values) and save it to an
# .rds file in the working directory.
#   numbers:    numeric vector to average
#   trim_ratio: fraction of observations trimmed from each end, passed to
#               the `trim` argument of mean()
# The file name is built from the first element of `numbers`, followed by
# `trim_ratio` and the ".rds" extension (e.g. "10.2.rds" for c(1, 5, NA), 0.2).
# The function is called for its side effect: its value is whatever
# saveRDS() returns.
custom_mean <- function(numbers, trim_ratio) {
  # process the data
  my_mean <- mean(x = numbers, trim = trim_ratio, na.rm = TRUE)
  # save it as a file
  saveRDS(my_mean, file = paste0(numbers[1], trim_ratio, ".rds"))
}
# use the function
custom_mean(c(1, 5, NA), 0.2)
# iterate
# one numeric vector per call...
list_to_iterate_on <- list(
  c(1, 5, NA),
  c(1, 4, 6, 8),
  c(1, 2, 7, 9, NA)
)
# ...paired position by position with one trim ratio per call
trim_ratios <- c(0.2, 0.7, 0.1)
library(purrr)
# alternative to apply functions or for loops:
# map2() calls custom_mean(list_to_iterate_on[[i]], trim_ratios[[i]]) for each i
map2(list_to_iterate_on, trim_ratios, custom_mean)
# pmap(): construct a dataframe of all combinations,
# with each column containing the argument values to use
# construct the dataframe of all combinations
animal <- c("magpie", "pelican", "ibis")
treatment <- c("dry food", "sludge", "grain")
# only three rows (values are paired by position)
experiment <- data.frame(animal, treatment)
# all combinations (every animal crossed with every treatment)
library(tidyr)
all_combinations <- expand(experiment, animal, treatment)
UQRUG 28
meeting
2022-06-29: UQRUG 28
Attendees
- Stéphane: Library | here to help and say goodbye
- Chris: Civil Engineering - Transport | just tagging along
- Luke: Library | here to help and say hello
- Olalekan: Biological Sciences | here to say hello…
Topics discussed and code
Iterating instead of using repetitive code
The trick here is to:
- Encapsulate the repetitive code into a function, exposing the things that are likely to change as arguments
- Create a vector of values (or several)
- Use a for loop, or an apply() function, or a map() function (from purrr) to map the function to each element
List files
# listing files
list.files() # all files in working directory
# `pattern` is a regular expression, not a glob: escape the dot and anchor
# the end so that only names ending in ".rds" match (a bare "rds" would
# also match e.g. "birds.csv")
only_rds <- list.files(pattern = "\\.rds$")
# same, but looking inside the "analysis" directory
only_rds <- list.files("analysis", "\\.rds$")
only_rds
# full path (from working directory)
only_rds <- list.files("analysis", "\\.rds$", full.names = TRUE)
only_rds
Remove file extension from path
# remove file extension from path
library(stringr)
# the pattern is a regular expression: escape the dot (otherwise it matches
# any character) and anchor it with $ so only a trailing ".txt" is removed
str_replace("filename.txt", "\\.txt$", "")
Extract information from filenames
Using the tidyverse and pdftools for preparing PDF text before analysis with quanteda. pdftools was used for its specific ability to return the page numbers of the pdfs.
library(pdftools)
library(tidyverse)
# create a character vector of the PDF file paths
# `pattern` is a regular expression, not a shell glob, so "\\.pdf$" is used
# to match names ending in ".pdf"; full.names = TRUE keeps the "./pdfs/..."
# prefix and recursive = TRUE descends into subdirectories
myfiles <- list.files(
  path = "./pdfs", pattern = "\\.pdf$", all.files = FALSE,
  full.names = TRUE, recursive = TRUE,
  ignore.case = FALSE, include.dirs = TRUE, no.. = FALSE
)
# Function to import each pdf file, and place the text in a dataframe
#   k: path to a single PDF file
# Returns a tibble with one row per page: the page text (pdf.text), the year
# taken from the path, the page number and the state taken from the path.
import_pdf <- function(k) {
  # turn the pdf into a text list each page will become a row
  pdf.text <- pdftools::pdf_text(k)
  # flatten the list
  pdf.text <- unlist(pdf.text)
  # tibble() replaces the deprecated data_frame(); the table is built once
  # and reused both for the pipeline and for its row names
  pdfdf <- tibble(pdf.text)
  # turn the list into a dataframe, extracting the year from the path and
  # using the separate function to extract the state from the path
  pdfdf %>%
    mutate(
      # first run of four digits found in the path
      year = str_extract(k, "[:digit:]{4}") %>% as.integer(),
      # row names are "1", "2", ... so this is the page index as character
      pagenumber = row.names(pdfdf),
      filename = k
    ) %>%
    # keeps only the third "/"-separated piece of the path as `state`;
    # assumes paths shaped like "./pdfs/<state>/<file>.pdf" -- TODO confirm
    separate(filename, c(NA, NA, "state", NA), sep = "/")
}
# run the function on all pdf files
# import every file in myfiles and row-bind the results into one data frame
all_pdfs <- map_dfr(myfiles, import_pdf)
See the quanteda tutorials