Try to build an analysis workflow yourself!
Implementing Sykdomspulsen Core requires a number of functions to be called in the correct order. To make this as simple as possible, we have provided a skeleton implementation at https://github.com/folkehelseinstituttet/scskeleton
You should clone this GitHub repo (https://github.com/folkehelseinstituttet/scskeleton) to your server. This will be the package that you will be working on throughout this tutorial. You may choose to do a global find/replace on scskeleton
with the name you want for your R package. We will refer to this R package as your “sc implementation”.
You should also clone https://github.com/folkehelseinstituttet/scexample to your server. This is the end product of the tutorial, and you should refer to it in order to check your work.
For the purposes of this tutorial, we assume that the reader is either using RStudio Server Open Source or RStudio Workbench inside Docker containers that have been built according to the Sykdomspulsen specifications. We will refer to your implementation of RStudio Server Open Source/RStudio Workbench with the generic term “RStudio”.
You will be creating an analysis workflow where you download two datasets from SSB (https://data.ssb.no/api/v0/dataset/49678.csv?lang=en and https://data.ssb.no/api/v0/dataset/25138.csv?lang=en), clean them, import then, and analyze them.
Create two schemas.
https://github.com/folkehelseinstituttet/scexample/blob/main/R/03_db_schemas.r#L166-L168
166 | name_access = c("anon"),
167 | name_grouping = "example_house_prices",
168 | name_variant = NULL,
With variables household_income_median_all_households_nok
, household_income_median_singles_nok
, household_income_median_couples_without_children_nok
, household_income_median_couples_with_children_nok
, household_income_median_single_with_children_nok
.
https://github.com/folkehelseinstituttet/scexample/blob/main/R/03_db_schemas.r#L116-L118
116 | name_access = c("anon"),
117 | name_grouping = "example_income",
118 | name_variant = NULL,
With variables new_house_price_per_m2_nok
, used_house_price_per_m2_nok
.
https://github.com/folkehelseinstituttet/scexample/blob/main/R/03_db_schemas.r#L114-L209
114 | ## > anon_example_income ----
115 | sc::add_schema_v8(
116 | name_access = c("anon"),
117 | name_grouping = "example_income",
118 | name_variant = NULL,
119 | db_configs = sc::config$db_configs,
120 | field_types = c(
121 | "granularity_time" = "TEXT",
122 | "granularity_geo" = "TEXT",
123 | "country_iso3" = "TEXT",
124 | "location_code" = "TEXT",
125 | "border" = "INTEGER",
126 | "age" = "TEXT",
127 | "sex" = "TEXT",
128 |
129 | "date" = "DATE",
130 |
131 | "isoyear" = "INTEGER",
132 | "isoweek" = "INTEGER",
133 | "isoyearweek" = "TEXT",
134 | "season" = "TEXT",
135 | "seasonweek" = "DOUBLE",
136 |
137 | "calyear" = "INTEGER",
138 | "calmonth" = "INTEGER",
139 | "calyearmonth" = "TEXT",
140 |
141 | "household_income_median_all_households_nok" = "DOUBLE",
142 | "household_income_median_singles_nok" = "DOUBLE",
143 | "household_income_median_couples_without_children_nok" = "DOUBLE",
144 | "household_income_median_couples_with_children_nok" = "DOUBLE",
145 | "household_income_median_single_with_children_nok" = "DOUBLE"
146 | ),
147 | keys = c(
148 | "granularity_time",
149 | "location_code",
150 | "date",
151 | "age",
152 | "sex"
153 | ),
154 | censors = list(
155 | anon = list(
156 |
157 | )
158 | ),
159 | validator_field_types = sc::validator_field_types_sykdomspulsen,
160 | validator_field_contents = sc::validator_field_contents_sykdomspulsen,
161 | info = "This db table is used for..."
162 | )
163 |
164 | ## > anon_example_house_prices ----
165 | sc::add_schema_v8(
166 | name_access = c("anon"),
167 | name_grouping = "example_house_prices",
168 | name_variant = NULL,
169 | db_configs = sc::config$db_configs,
170 | field_types = c(
171 | "granularity_time" = "TEXT",
172 | "granularity_geo" = "TEXT",
173 | "country_iso3" = "TEXT",
174 | "location_code" = "TEXT",
175 | "border" = "INTEGER",
176 | "age" = "TEXT",
177 | "sex" = "TEXT",
178 |
179 | "date" = "DATE",
180 |
181 | "isoyear" = "INTEGER",
182 | "isoweek" = "INTEGER",
183 | "isoyearweek" = "TEXT",
184 | "season" = "TEXT",
185 | "seasonweek" = "DOUBLE",
186 |
187 | "calyear" = "INTEGER",
188 | "calmonth" = "INTEGER",
189 | "calyearmonth" = "TEXT",
190 |
191 | "new_house_price_per_m2_nok" = "DOUBLE",
192 | "used_house_price_per_m2_nok" = "DOUBLE"
193 | ),
194 | keys = c(
195 | "granularity_time",
196 | "location_code",
197 | "date",
198 | "age",
199 | "sex"
200 | ),
201 | censors = list(
202 | anon = list(
203 |
204 | )
205 | ),
206 | validator_field_types = sc::validator_field_types_sykdomspulsen,
207 | validator_field_contents = sc::validator_field_contents_sykdomspulsen,
208 | info = "This db table is used for..."
209 | )
Create a task definition.
https://github.com/folkehelseinstituttet/scexample/blob/main/R/04_tasks.r#L105-L107
105 | name_grouping = "household_incomes_and_house_prices",
106 | name_action = "import_data",
107 | name_variant = NULL,
You will need only 1 analysis. The aim of this task is to download the data from SSB and import it into the two schemas previously specified.
https://github.com/folkehelseinstituttet/scexample/blob/main/R/04_tasks.r#L102-L127
102 | ## > household_incomes_and_house_prices_import_data ----
103 | # tm_run_task("household_incomes_and_house_prices_import_data")
104 | sc::add_task_from_config_v8(
105 | name_grouping = "household_incomes_and_house_prices",
106 | name_action = "import_data",
107 | name_variant = NULL,
108 | cores = 1,
109 | plan_analysis_fn_name = NULL,
110 | for_each_plan = plnr::expand_list(
111 | x = 1
112 | ),
113 | for_each_analysis = NULL,
114 | universal_argset = NULL,
115 | upsert_at_end_of_each_plan = FALSE,
116 | insert_at_end_of_each_plan = FALSE,
117 | action_fn_name = "scexample::household_incomes_and_house_prices_import_data_action",
118 | data_selector_fn_name = "scexample::household_incomes_and_house_prices_import_data_data_selector",
119 | schema = list(
120 | # input
121 |
122 | # output
123 | "anon_example_income" = sc::config$schemas$anon_example_income,
124 | "anon_example_house_prices" = sc::config$schemas$anon_example_house_prices
125 | ),
126 | info = "This task downloads and cleans the raw data and aggregates it to county and national level"
127 | )
Create a data_selector_fn that downloads the two data sets from SSB (https://data.ssb.no/api/v0/dataset/49678.csv?lang=en and https://data.ssb.no/api/v0/dataset/25138.csv?lang=en)
https://github.com/folkehelseinstituttet/scexample/blob/main/R/household_incomes_and_house_prices_import_data.r#L309-L382
309 | # **** data_selector **** ----
310 | #' household_incomes_and_house_prices_import_data (data selector)
311 | #' @param argset Argset
312 | #' @param schema DB Schema
313 | #' @export
314 | household_incomes_and_house_prices_import_data_data_selector = function(argset, schema){
315 | if(plnr::is_run_directly()){
316 | # sc::tm_get_plans_argsets_as_dt("household_incomes_and_house_prices_import_data")
317 |
318 | index_plan <- 1
319 |
320 | argset <- sc::tm_get_argset("household_incomes_and_house_prices_import_data", index_plan = index_plan)
321 | schema <- sc::tm_get_schema("household_incomes_and_house_prices_import_data")
322 | }
323 |
324 | # household incomes
325 | # https://data.ssb.no/api/v0/dataset/49678.csv?lang=en
326 | d_income <- fread("https://data.ssb.no/api/v0/dataset/49678.csv?lang=en") |>
327 | dplyr::filter(contents=="Income after taxes, median (NOK)") |>
328 | tidyr::pivot_wider(
329 | id_cols = c(region, year),
330 | names_from = c(`type of household`),
331 | values_from = c(`06944: Households' income, by region, type of household, year and contents`)
332 | ) |>
333 | janitor::clean_names() |>
334 | dplyr::mutate(
335 | location_code = paste0("municip",stringr::str_extract(region, "^[0-9][0-9][0-9][0-9]")),
336 | household_income_median_all_households_nok = as.numeric(x0000_all_households),
337 | household_income_median_singles_nok = as.numeric(x0001_living_alone),
338 | household_income_median_couples_without_children_nok = as.numeric(x0002_couple_without_resident_children),
339 | household_income_median_couples_with_children_nok = as.numeric(x0003_couple_with_resident_children_0_17_year),
340 | household_income_median_single_with_children_nok = as.numeric(x0004_single_mother_father_with_children_0_17_year)
341 | ) |>
342 | dplyr::select(
343 | location_code,
344 | calyear = year,
345 | household_income_median_all_households_nok,
346 | household_income_median_singles_nok,
347 | household_income_median_couples_without_children_nok,
348 | household_income_median_couples_with_children_nok,
349 | household_income_median_single_with_children_nok
350 | ) |>
351 | data.table()
352 |
353 | # house prices
354 | # https://data.ssb.no/api/v0/dataset/25138.csv?lang=en
355 | d_price <- fread("https://data.ssb.no/api/v0/dataset/25138.csv?lang=en") |>
356 | dplyr::filter(contents=="Price per square meter (NOK)") |>
357 | tidyr::pivot_wider(
358 | id_cols = c(region, year),
359 | names_from = c(`type of detached houses`),
360 | values_from = c(`03364: Prices per square meter, by region, type of detached houses, year and contents`)
361 | ) |>
362 | janitor::clean_names() |>
363 | dplyr::mutate(
364 | location_code = paste0("county",stringr::str_extract(region, "^[0-9][0-9]")),
365 | new_house_price_per_m2_nok =as.numeric(x01_new_detached_houses),
366 | used_house_price_per_m2_nok = as.numeric(x02_used_detached_houses)
367 | ) |>
368 | dplyr::select(
369 | location_code,
370 | calyear = year,
371 | new_house_price_per_m2_nok,
372 | used_house_price_per_m2_nok
373 | ) |>
374 | data.table()
375 |
376 | # The variable returned must be a named list
377 | retval <- list(
378 | "income" = d_income,
379 | "price" = d_price
380 | )
381 | retval
382 | }
Create an action_fn that:
https://github.com/folkehelseinstituttet/scexample/blob/main/R/household_incomes_and_house_prices_import_data.r#L2-L307
2 | # **** action **** ----
3 | #' household_incomes_and_house_prices_import_data (action)
4 | #' @param data Data
5 | #' @param argset Argset
6 | #' @param schema DB Schema
7 | #' @export
8 | household_incomes_and_house_prices_import_data_action <- function(data, argset, schema) {
9 | # tm_run_task("household_incomes_and_house_prices_import_data")
10 |
11 | if(plnr::is_run_directly()){
12 | # sc::tm_get_plans_argsets_as_dt("household_incomes_and_house_prices_import_data")
13 |
14 | index_plan <- 1
15 | index_analysis <- 1
16 |
17 | data <- sc::tm_get_data("household_incomes_and_house_prices_import_data", index_plan = index_plan)
18 | argset <- sc::tm_get_argset("household_incomes_and_house_prices_import_data", index_plan = index_plan, index_analysis = index_analysis)
19 | schema <- sc::tm_get_schema("household_incomes_and_house_prices_import_data")
20 | }
21 |
22 | # code goes here
23 | # special case that runs before everything
24 | if(argset$first_analysis == TRUE){
25 |
26 | }
27 |
28 | # skeleton for income ----
29 | # 1. Create a variable (possibly a list) to hold the data
30 | d_agg <- list()
31 | d_agg$calyear_municip <- copy(data$income)
32 |
33 | # redistricting
34 | for(i in seq_along(d_agg)){
35 | d_agg[[i]] <- merge(
36 | d_agg[[i]],
37 | fhidata::norway_locations_redistricting()[,-"granularity_geo"],
38 | by.x = c("location_code", "calyear"),
39 | by.y = c("location_code_original", "calyear"),
40 | all.x = TRUE
41 | )
42 | }
43 |
44 | # 5. Re-aggregate your data to different geographical levels to ensure that duplicates have now been removed
45 | # this will also fix the redistricting/kommunesammenslaaing issues
46 | for(i in seq_along(d_agg)){
47 | d_agg[[i]] <- d_agg[[i]][,.(
48 | household_income_median_all_households_nok = mean(household_income_median_all_households_nok * weighting, na.rm=T),
49 | household_income_median_singles_nok = mean(household_income_median_singles_nok * weighting, na.rm=T),
50 | household_income_median_couples_without_children_nok = mean(household_income_median_couples_without_children_nok * weighting, na.rm=T),
51 | household_income_median_couples_with_children_nok = mean(household_income_median_couples_with_children_nok * weighting, na.rm=T),
52 | household_income_median_single_with_children_nok = mean(household_income_median_single_with_children_nok * weighting, na.rm=T)
53 | ), keyby=.(
54 | location_code = location_code_current,
55 | calyear
56 | )]
57 | }
58 |
59 | d_agg[]
60 |
61 | # 6. Pull out important dates
62 | calyear_min <- min(d_agg$calyear_municip$calyear)
63 | calyear_max <- max(d_agg$calyear_municip$calyear)
64 |
65 | # 7. Create `multiskeleton`
66 | # granularity_geo should have the following groups:
67 | # - nodata (when no data is available, and there is no "finer" data available to aggregate up)
68 | # - all levels of granularity_geo where you have data available
69 | # If you do not have data for a specific granularity_geo, but there is "finer" data available
70 | # then you should not include this granularity_geo in the multiskeleton, because you will create
71 | # it later when you aggregate up your data (baregion)
72 | multiskeleton_calyear <- fhidata::make_skeleton(
73 | calyear_min = calyear_min,
74 | calyear_max = calyear_max,
75 | granularity_geo = list(
76 | "nodata" = c(
77 | "wardoslo",
78 | "extrawardoslo",
79 | "missingwardoslo",
80 | "wardbergen",
81 | "missingwardbergen",
82 | "wardstavanger",
83 | "missingwardstavanger",
84 | "baregion",
85 | "notmainlandmunicip",
86 | "missingmunicip"
87 | ),
88 |
89 | "municip" = c(
90 | "municip"
91 | )
92 | )
93 | )
94 |
95 | # 8. Merge in the information you have at different geographical granularities
96 | # one level at a time
97 | # municip
98 | multiskeleton_calyear$municip[
99 | d_agg$calyear_municip,
100 | on = c("location_code", "calyear"),
101 | c(
102 | "household_income_median_all_households_nok",
103 | "household_income_median_singles_nok",
104 | "household_income_median_couples_without_children_nok",
105 | "household_income_median_couples_with_children_nok",
106 | "household_income_median_single_with_children_nok"
107 | ) := .(
108 | household_income_median_all_households_nok,
109 | household_income_median_singles_nok,
110 | household_income_median_couples_without_children_nok,
111 | household_income_median_couples_with_children_nok,
112 | household_income_median_single_with_children_nok
113 | )
114 | ]
115 |
116 | multiskeleton_calyear$municip[]
117 |
118 | # 9. Aggregate up to higher geographical granularities
119 | multiskeleton_calyear$county <- multiskeleton_calyear$municip[
120 | fhidata::norway_locations_hierarchy(
121 | from = "municip",
122 | to = "county"
123 | ),
124 | on = c(
125 | "location_code==from_code"
126 | )
127 | ][,
128 | .(
129 | household_income_median_all_households_nok = mean(household_income_median_all_households_nok, na.rm=T),
130 | household_income_median_singles_nok = mean(household_income_median_singles_nok, na.rm=T),
131 | household_income_median_couples_without_children_nok = mean(household_income_median_couples_without_children_nok, na.rm=T),
132 | household_income_median_couples_with_children_nok = mean(household_income_median_couples_with_children_nok, na.rm=T),
133 | household_income_median_single_with_children_nok = mean(household_income_median_single_with_children_nok, na.rm=T),
134 | granularity_geo = "county"
135 | ),
136 | by=.(
137 | granularity_time,
138 | calyear,
139 | location_code = to_code
140 | )
141 | ]
142 |
143 | multiskeleton_calyear$county[]
144 |
145 | #nation
146 | multiskeleton_calyear$nation <- multiskeleton_calyear$municip[
147 | ,
148 | .(
149 | household_income_median_all_households_nok = mean(household_income_median_all_households_nok, na.rm=T),
150 | household_income_median_singles_nok = mean(household_income_median_singles_nok, na.rm=T),
151 | household_income_median_couples_without_children_nok = mean(household_income_median_couples_without_children_nok, na.rm=T),
152 | household_income_median_couples_with_children_nok = mean(household_income_median_couples_with_children_nok, na.rm=T),
153 | household_income_median_single_with_children_nok = mean(household_income_median_single_with_children_nok, na.rm=T),
154 | granularity_geo = "nation",
155 | location_code = "norge"
156 | ),
157 | by=.(
158 | granularity_time,
159 | calyear
160 | )
161 | ]
162 |
163 | multiskeleton_calyear$nation[]
164 |
165 | # combine all the different granularity_geos
166 | skeleton_calyear <- rbindlist(multiskeleton_calyear, fill = TRUE, use.names = TRUE)
167 |
168 | skeleton_calyear[]
169 |
170 | # fix up missing structural data
171 | skeleton_calyear[, age := "total"]
172 | skeleton_calyear[, sex := "total"]
173 | sc::fill_in_missing_v8(skeleton_calyear, border = config$border)
174 |
175 | # put data in db table
176 | schema$anon_example_income$drop_all_rows_and_then_insert_data(skeleton_calyear)
177 |
178 | # check that it uploaded
179 | nrow(skeleton_calyear)
180 | schema$anon_example_income$tbl() |> dplyr::summarize(n()) |> dplyr::collect()
181 |
182 | # skeleton for prices ----
183 | # 1. Create a variable (possibly a list) to hold the data
184 | d_agg <- list()
185 | d_agg$calyear_county <- copy(data$price)
186 |
187 | # redistricting
188 | for(i in seq_along(d_agg)){
189 | d_agg[[i]] <- merge(
190 | d_agg[[i]],
191 | fhidata::norway_locations_redistricting()[,-"granularity_geo"],
192 | by.x = c("location_code", "calyear"),
193 | by.y = c("location_code_original", "calyear"),
194 | all.x = TRUE
195 | )
196 | }
197 |
198 | # 5. Re-aggregate your data to different geographical levels to ensure that duplicates have now been removed
199 | # this will also fix the redistricting/kommunesammenslaaing issues
200 | for(i in seq_along(d_agg)){
201 | d_agg[[i]] <- d_agg[[i]][,.(
202 | new_house_price_per_m2_nok = mean(new_house_price_per_m2_nok * weighting, na.rm=T),
203 | used_house_price_per_m2_nok = mean(used_house_price_per_m2_nok * weighting, na.rm=T)
204 | ), keyby=.(
205 | location_code = location_code_current,
206 | calyear
207 | )]
208 | }
209 |
210 | d_agg[]
211 |
212 | # 6. Pull out important dates
213 | calyear_min <- min(d_agg$calyear_county$calyear)
214 | calyear_max <- max(d_agg$calyear_county$calyear)
215 |
216 | # 7. Create `multiskeleton`
217 | # granularity_geo should have the following groups:
218 | # - nodata (when no data is available, and there is no "finer" data available to aggregate up)
219 | # - all levels of granularity_geo where you have data available
220 | # If you do not have data for a specific granularity_geo, but there is "finer" data available
221 | # then you should not include this granularity_geo in the multiskeleton, because you will create
222 | # it later when you aggregate up your data (baregion)
223 | multiskeleton_calyear <- fhidata::make_skeleton(
224 | calyear_min = calyear_min,
225 | calyear_max = calyear_max,
226 | granularity_geo = list(
227 | "nodata" = c(
228 | "wardoslo",
229 | "extrawardoslo",
230 | "missingwardoslo",
231 | "wardbergen",
232 | "missingwardbergen",
233 | "wardstavanger",
234 | "missingwardstavanger",
235 | "baregion",
236 | "notmainlandmunicip",
237 | "missingmunicip",
238 | "municip"
239 | ),
240 |
241 | "county" = c(
242 | "county"
243 | )
244 | )
245 | )
246 |
247 | # 8. Merge in the information you have at different geographical granularities
248 | # one level at a time
249 | # municip
250 | multiskeleton_calyear$county[
251 | d_agg$calyear_county,
252 | on = c("location_code", "calyear"),
253 | c(
254 | "new_house_price_per_m2_nok",
255 | "used_house_price_per_m2_nok"
256 | ) := .(
257 | new_house_price_per_m2_nok,
258 | used_house_price_per_m2_nok
259 | )
260 | ]
261 |
262 | multiskeleton_calyear$county[]
263 |
264 | # 9. Aggregate up to higher geographical granularities
265 | #nation
266 | multiskeleton_calyear$nation <- multiskeleton_calyear$county[
267 | ,
268 | .(
269 | new_house_price_per_m2_nok = mean(new_house_price_per_m2_nok, na.rm=T),
270 | used_house_price_per_m2_nok = mean(used_house_price_per_m2_nok, na.rm=T),
271 | granularity_geo = "nation",
272 | location_code = "norge"
273 | ),
274 | by=.(
275 | granularity_time,
276 | calyear
277 | )
278 | ]
279 |
280 | multiskeleton_calyear$nation[]
281 |
282 | # combine all the different granularity_geos
283 | skeleton_calyear <- rbindlist(multiskeleton_calyear, fill = TRUE, use.names = TRUE)
284 |
285 | skeleton_calyear[]
286 |
287 | # fix up missing structural data
288 | skeleton_calyear[, age := "total"]
289 | skeleton_calyear[, sex := "total"]
290 | sc::fill_in_missing_v8(skeleton_calyear, border = config$border)
291 |
292 | # put data in db table
293 | schema$anon_example_house_prices$drop_all_rows_and_then_insert_data(skeleton_calyear)
294 |
295 | # check that it uploaded
296 | nrow(skeleton_calyear)
297 | schema$anon_example_house_prices$tbl() |> dplyr::summarize(n()) |> dplyr::collect()
298 |
299 | # special case that runs after everything
300 | # copy to anon_web?
301 | if(argset$last_analysis == TRUE){
302 | # sc::copy_into_new_table_where(
303 | # table_from = "anon_X",
304 | # table_to = "anon_webkht"
305 | # )
306 | }
307 | }
Create one schema.
https://github.com/folkehelseinstituttet/scexample/blob/main/R/03_db_schemas.r#L213-L215
213 | name_access = c("anon"),
214 | name_grouping = "example_house_prices",
215 | name_variant = "outliers_after_adjusting_for_income",
With variables household_income_median_all_households_nok
, new_house_price_per_m2_nok
, new_house_price_per_m2_baseline_nok
, new_house_price_per_m2_nok_predinterval_q02x5
, new_house_price_per_m2_nok_predinterval_q97x5
, new_house_price_per_m2_nok_status
.
https://github.com/folkehelseinstituttet/scexample/blob/main/R/03_db_schemas.r#L212-L261
212 | sc::add_schema_v8(
213 | name_access = c("anon"),
214 | name_grouping = "example_house_prices",
215 | name_variant = "outliers_after_adjusting_for_income",
216 | db_configs = sc::config$db_configs,
217 | field_types = c(
218 | "granularity_time" = "TEXT",
219 | "granularity_geo" = "TEXT",
220 | "country_iso3" = "TEXT",
221 | "location_code" = "TEXT",
222 | "border" = "INTEGER",
223 | "age" = "TEXT",
224 | "sex" = "TEXT",
225 |
226 | "date" = "DATE",
227 |
228 | "isoyear" = "INTEGER",
229 | "isoweek" = "INTEGER",
230 | "isoyearweek" = "TEXT",
231 | "season" = "TEXT",
232 | "seasonweek" = "DOUBLE",
233 |
234 | "calyear" = "INTEGER",
235 | "calmonth" = "INTEGER",
236 | "calyearmonth" = "TEXT",
237 |
238 | "household_income_median_all_households_nok" = "DOUBLE",
239 |
240 | "new_house_price_per_m2_nok" = "DOUBLE",
241 | "new_house_price_per_m2_baseline_nok" = "DOUBLE",
242 | "new_house_price_per_m2_nok_predinterval_q02x5" = "DOUBLE",
243 | "new_house_price_per_m2_nok_predinterval_q97x5" = "DOUBLE",
244 | "new_house_price_per_m2_nok_status" = "TEXT"
245 | ),
246 | keys = c(
247 | "granularity_time",
248 | "location_code",
249 | "date",
250 | "age",
251 | "sex"
252 | ),
253 | censors = list(
254 | anon = list(
255 |
256 | )
257 | ),
258 | validator_field_types = sc::validator_field_types_sykdomspulsen,
259 | validator_field_contents = sc::validator_field_contents_sykdomspulsen,
260 | info = "This db table is used for..."
261 | )
Create a task definition.
https://github.com/folkehelseinstituttet/scexample/blob/main/R/04_tasks.r#L132-L134
132 | name_grouping = "household_incomes_and_house_prices",
133 | name_action = "fit_model_and_find_outliers",
134 | name_variant = NULL,
You will need only 1 analysis. The aim of this task is to use the data from sc::config$schemas$anon_example_income
and sc::config$schemas$anon_example_house_prices
at granularity_geo="county"
to run a regression analysis to identify if any counties have higher new house prices than expected, adjusted for the median household income for all households.
https://github.com/folkehelseinstituttet/scexample/blob/main/R/04_tasks.r#L129-L155
129 | ## > household_incomes_and_house_prices_fit_model_and_find_outliers ----
130 | # tm_run_task("household_incomes_and_house_prices_fit_model_and_find_outliers")
131 | sc::add_task_from_config_v8(
132 | name_grouping = "household_incomes_and_house_prices",
133 | name_action = "fit_model_and_find_outliers",
134 | name_variant = NULL,
135 | cores = 1,
136 | plan_analysis_fn_name = NULL,
137 | for_each_plan = plnr::expand_list(
138 | x = 1
139 | ),
140 | for_each_analysis = NULL,
141 | universal_argset = NULL,
142 | upsert_at_end_of_each_plan = FALSE,
143 | insert_at_end_of_each_plan = FALSE,
144 | action_fn_name = "scexample::household_incomes_and_house_prices_fit_model_and_find_outliers_action",
145 | data_selector_fn_name = "scexample::household_incomes_and_house_prices_fit_model_and_find_outliers_data_selector",
146 | schema = list(
147 | # input
148 | "anon_example_income" = sc::config$schemas$anon_example_income,
149 | "anon_example_house_prices" = sc::config$schemas$anon_example_house_prices,
150 |
151 | # output
152 | "anon_example_house_prices_outliers_after_adjusting_for_income" = sc::config$schemas$anon_example_house_prices_outliers_after_adjusting_for_income
153 | ),
154 | info = "This task downloads and cleans the raw data and aggregates it to county and national level"
155 | )
Create a data_selector_fn that selects the data from sc::config$schemas$anon_example_income
and sc::config$schemas$anon_example_house_prices
at granularity_geo="county"
https://github.com/folkehelseinstituttet/scexample/blob/main/R/household_incomes_and_house_prices_fit_model_and_find_outliers.r#L81-L198
81 | # **** data_selector **** ----
82 | #' household_incomes_and_house_prices_fit_model_and_find_outliers (data selector)
83 | #' @param argset Argset
84 | #' @param schema DB Schema
85 | #' @export
86 | household_incomes_and_house_prices_fit_model_and_find_outliers_data_selector = function(argset, schema){
87 | if(plnr::is_run_directly()){
88 | # sc::tm_get_plans_argsets_as_dt("household_incomes_and_house_prices_fit_model_and_find_outliers")
89 |
90 | index_plan <- 1
91 |
92 | argset <- sc::tm_get_argset("household_incomes_and_house_prices_fit_model_and_find_outliers", index_plan = index_plan)
93 | schema <- sc::tm_get_schema("household_incomes_and_house_prices_fit_model_and_find_outliers")
94 | }
95 |
96 | # The database schemas can be accessed here
97 | # schema$anon_example_income$print_dplyr_select()
98 | d_income <- schema$anon_example_income$tbl() %>%
99 | sc::mandatory_db_filter(
100 | granularity_time = "calyear",
101 | granularity_time_not = NULL,
102 | granularity_geo = "county",
103 | granularity_geo_not = NULL,
104 | country_iso3 = NULL,
105 | location_code = NULL,
106 | age = "total",
107 | age_not = NULL,
108 | sex = "total",
109 | sex_not = NULL
110 | ) %>%
111 | dplyr::filter(calyear %in% 2000:2019) %>%
112 | dplyr::select(
113 | granularity_time,
114 | granularity_geo,
115 | # country_iso3,
116 | location_code,
117 | # border,
118 | age,
119 | sex,
120 | #
121 | # date,
122 | #
123 | # isoyear,
124 | # isoweek,
125 | # isoyearweek,
126 | # season,
127 | # seasonweek,
128 |
129 | calyear,
130 | # calmonth,
131 | # calyearmonth,
132 |
133 | household_income_median_all_households_nok
134 | # household_income_median_singles_nok,
135 | # household_income_median_couples_without_children_nok,
136 | # household_income_median_couples_with_children_nok,
137 | # household_income_median_single_with_children_nok
138 | ) %>%
139 | dplyr::collect() %>%
140 | as.data.table() %>%
141 | setorder(
142 | location_code,
143 | calyear
144 | )
145 |
146 | # schema$anon_example_house_prices$print_dplyr_select()
147 | d_price <- schema$anon_example_house_prices$tbl() %>%
148 | sc::mandatory_db_filter(
149 | granularity_time = "calyear",
150 | granularity_time_not = NULL,
151 | granularity_geo = "county",
152 | granularity_geo_not = NULL,
153 | country_iso3 = NULL,
154 | location_code = NULL,
155 | age = "total",
156 | age_not = NULL,
157 | sex = "total",
158 | sex_not = NULL
159 | ) %>%
160 | dplyr::filter(calyear %in% 2000:2019) %>%
161 | dplyr::select(
162 | # granularity_time,
163 | # granularity_geo,
164 | # country_iso3,
165 | location_code,
166 | # border,
167 | # age,
168 | # sex,
169 | #
170 | # date,
171 | #
172 | # isoyear,
173 | # isoweek,
174 | # isoyearweek,
175 | # season,
176 | # seasonweek,
177 |
178 | calyear,
179 | # calmonth,
180 | # calyearmonth,
181 |
182 | new_house_price_per_m2_nok
183 | # used_house_price_per_m2_nok
184 | ) %>%
185 | dplyr::collect() %>%
186 | as.data.table() %>%
187 | setorder(
188 | location_code,
189 | calyear
190 | )
191 |
192 | # The variable returned must be a named list
193 | retval <- list(
194 | "income" = d_income,
195 | "price" = d_price
196 | )
197 | retval
198 | }
Create an action_fn that:
new_house_price_per_m2_nok
, exposure household_income_median_all_households_nok
and random intercept (1|location_code)
new_house_price_per_m2_baseline_nok
(expected new house price, adjusted for the median household income for all households), new_house_price_per_m2_nok_predinterval_q97x5
(97.5 quantile in a prediction interval for new house price, adjusted for the median household income for all households), new_house_price_per_m2_nok_predinterval_q02x5
(2.5 quantile in a prediction interval for new house price, adjusted for the median household income for all households)https://github.com/folkehelseinstituttet/scexample/blob/main/R/household_incomes_and_house_prices_fit_model_and_find_outliers.r#L1-L79
1 | # **** action **** ----
2 | #' household_incomes_and_house_prices_fit_model_and_find_outliers (action)
3 | #' @param data Data
4 | #' @param argset Argset
5 | #' @param schema DB Schema
6 | #' @export
7 | household_incomes_and_house_prices_fit_model_and_find_outliers_action <- function(data, argset, schema) {
8 | # tm_run_task("household_incomes_and_house_prices_fit_model_and_find_outliers")
9 |
10 | if(plnr::is_run_directly()){
11 | # sc::tm_get_plans_argsets_as_dt("household_incomes_and_house_prices_fit_model_and_find_outliers")
12 |
13 | index_plan <- 1
14 | index_analysis <- 1
15 |
16 | data <- sc::tm_get_data("household_incomes_and_house_prices_fit_model_and_find_outliers", index_plan = index_plan)
17 | argset <- sc::tm_get_argset("household_incomes_and_house_prices_fit_model_and_find_outliers", index_plan = index_plan, index_analysis = index_analysis)
18 | schema <- sc::tm_get_schema("household_incomes_and_house_prices_fit_model_and_find_outliers")
19 | }
20 |
21 | # code goes here
22 | # special case that runs before everything
23 | if(argset$first_analysis == TRUE){
24 |
25 | }
26 |
27 | d <- merge(
28 | data$income,
29 | data$price,
30 | by = c("location_code", "calyear")
31 | )
32 |
33 | d_pred <- d |> {\(x)
34 | lme4::lmer(
35 | new_house_price_per_m2_nok ~
36 | household_income_median_all_households_nok +
37 | (1|location_code),
38 | data = x
39 | )}() |>
40 | merTools::predictInterval(d, level = 0.95) |>
41 | {\(x) cbind(d, x)}() |>
42 | data.table()
43 |
44 | setnames(
45 | d_pred,
46 | c(
47 | "fit",
48 | "upr",
49 | "lwr"
50 | ),
51 | c(
52 | "new_house_price_per_m2_baseline_nok",
53 | "new_house_price_per_m2_nok_predinterval_q97x5",
54 | "new_house_price_per_m2_nok_predinterval_q02x5"
55 | )
56 | )
57 |
58 | d_pred[, new_house_price_per_m2_nok_status := "normal"]
59 | d_pred[new_house_price_per_m2_baseline_nok > new_house_price_per_m2_nok_predinterval_q97x5, new_house_price_per_m2_nok_status := "high"]
60 |
61 | xtabs(~d_pred$new_house_price_per_m2_nok_status)
62 |
63 | # put data in db table
64 | sc::fill_in_missing_v8(d_pred, border = config$border)
65 | schema$anon_example_house_prices_outliers_after_adjusting_for_income$drop_all_rows_and_then_insert_data(d_pred)
66 |
67 | # check that it uploaded
68 | nrow(d_pred)
69 | schema$anon_example_house_prices_outliers_after_adjusting_for_income$tbl() |> dplyr::summarize(n()) |> dplyr::collect()
70 |
71 | # special case that runs after everything
72 | # copy to anon_web?
73 | if(argset$last_analysis == TRUE){
74 | # sc::copy_into_new_table_where(
75 | # table_from = "anon_X",
76 | # table_to = "anon_webkht"
77 | # )
78 | }
79 | }
2021-07-15: Draft created.
If you see mistakes or want to suggest changes, please create an issue on the source repository.
Text and figures are licensed under Creative Commons Attribution CC BY 4.0. Source code is available at https://github.com/folkehelseinstituttet/sykdomspulsen-dokumentasjon, unless otherwise noted. The figures that have been reused from other sources don't fall under this license and can be recognized by a note in their caption: "Figure from ...".