Skip to content

Commit

Permalink
1.6.2
Browse files Browse the repository at this point in the history
  • Loading branch information
Oliver Keyes committed Sep 2, 2016
1 parent 0b14d73 commit 0a8ffe2
Show file tree
Hide file tree
Showing 5 changed files with 50 additions and 12 deletions.
4 changes: 2 additions & 2 deletions DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
Package: urltools
Type: Package
Title: Vectorised Tools for URL Handling and Parsing
Version: 1.5.1
Date: 2016-08-30
Version: 1.5.2
Date: 2016-09-01
Author: Oliver Keyes [aut, cre], Jay Jacobs [aut, cre], Mark Greenaway [ctb],
Bob Rudis [ctb], Alex Pinto [ctb], Maryam Khezrzadeh [ctb]
Maintainer: Oliver Keyes <[email protected]>
Expand Down
7 changes: 6 additions & 1 deletion NEWS
Original file line number Diff line number Diff line change
@@ -1,4 +1,9 @@
Version 1.5.1 [WIP]
Version 1.5.2
-------------------------------------------------------------------------

BUGS
* Custom suffix lists were not working properly.
Version 1.5.1
-------------------------------------------------------------------------

BUGS
Expand Down
13 changes: 9 additions & 4 deletions R/suffix.R
Original file line number Diff line number Diff line change
Expand Up @@ -142,12 +142,17 @@ suffix_extract <- function(domains, suffixes = NULL){
stop("Expected column named \"suffixes\" in suffixes data.frame")
}
}
suffix_load(suffixes)
holding <- suffix_load(suffixes)
} else {
holding <- list(suff_trie = urltools_env$suff_trie,
is_wildcard = urltools_env$is_wildcard,
cleaned_suffixes = urltools_env$cleaned_suffixes)
}

rev_domains <- reverse_strings(tolower(domains))
matched_suffixes <- triebeard::longest_match(urltools_env$suff_trie, rev_domains)
has_wildcard <- matched_suffixes %in% urltools_env$is_wildcard
is_suffix = domains %in% urltools_env$cleaned_suffixes
matched_suffixes <- triebeard::longest_match(holding$suff_trie, rev_domains)
has_wildcard <- matched_suffixes %in% holding$is_wildcard
is_suffix <- domains %in% holding$cleaned_suffixes
return(finalise_suffixes(domains, matched_suffixes, has_wildcard, is_suffix))
}

Expand Down
13 changes: 8 additions & 5 deletions R/zzz.R
Original file line number Diff line number Diff line change
Expand Up @@ -4,16 +4,19 @@ suffix_load <- function(suffixes = NULL){
if(is.null(suffixes)){
suffixes <- urltools::suffix_dataset
}
cleaned_suffixes <- gsub(x = urltools::suffix_dataset, pattern = "*.", replacement = "", fixed = TRUE)
cleaned_suffixes <- gsub(x = suffixes, pattern = "*.", replacement = "", fixed = TRUE)
is_wildcard <- cleaned_suffixes[which(grepl(x = urltools::suffix_dataset, pattern = "*.", fixed = TRUE))]
assign("is_wildcard", is_wildcard, envir = urltools_env)
assign("cleaned_suffixes", cleaned_suffixes, envir = urltools_env)
suff_trie <- triebeard::trie(keys = reverse_strings(paste0(".", cleaned_suffixes)),
values = cleaned_suffixes)
assign("suff_trie", suff_trie, envir = urltools_env)
return(list(suff_trie = suff_trie,
is_wildcard = is_wildcard,
cleaned_suffixes = cleaned_suffixes))
return(invisible())
}

.onLoad <- function(...) {
suffix_load()
holding <- suffix_load()
assign("is_wildcard", holding$is_wildcard, envir = urltools_env)
assign("cleaned_suffixes", holding$cleaned_suffixes, envir = urltools_env)
assign("suff_trie", holding$suff_trie, envir = urltools_env)
}
25 changes: 25 additions & 0 deletions tests/testthat/test_suffixes.R
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,31 @@ test_that("Suffix extraction works with new suffixes",{
expect_that(names(result), equals(c("host","subdomain","domain","suffix")))
expect_that(nrow(result), equals(1))

expect_that(result$subdomain[1], equals("en"))
expect_that(result$domain[1], equals("wikipedia"))
expect_that(result$suffix[1], equals("org"))
})

test_that("Suffix extraction works with an arbitrary suffixes database (to ensure it is loading it)",{
result <- suffix_extract(c("is-this-a.bananaboat", "en.wikipedia.org"), data.frame(suffixes = "bananaboat"))
expect_that(ncol(result), equals(4))
expect_that(names(result), equals(c("host","subdomain","domain","suffix")))
expect_that(nrow(result), equals(2))

expect_equal(result$subdomain[1], NA_character_)
expect_equal(result$domain[1], "is-this-a")
expect_equal(result$suffix[1], "bananaboat")
expect_equal(result$subdomain[2], NA_character_)
expect_equal(result$domain[2], NA_character_)
expect_equal(result$suffix[2], NA_character_)
})

test_that("Suffix extraction is back to normal using the internal database when it receives suffixes=NULL",{
result <- suffix_extract("en.wikipedia.org")
expect_that(ncol(result), equals(4))
expect_that(names(result), equals(c("host","subdomain","domain","suffix")))
expect_that(nrow(result), equals(1))

expect_that(result$subdomain[1], equals("en"))
expect_that(result$domain[1], equals("wikipedia"))
expect_that(result$suffix[1], equals("org"))
Expand Down

0 comments on commit 0a8ffe2

Please sign in to comment.