-
Notifications
You must be signed in to change notification settings - Fork 6
/
Copy pathnormalizers.go
107 lines (86 loc) · 3.3 KB
/
normalizers.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
package main
import (
"net/url"
"regexp"
"strings"
"golang.org/x/net/idna"
"github.com/usher2/u2ckdump/internal/logger"
)
// NormalizeDomain cleans up a possibly misprinted domain name and returns it
// in lowercase ASCII form.
//
// The steps are applied in this order: a leading (possibly misspelled)
// protocol is removed, everything from the first '/' or '\' onward is cut off,
// commas become dots, spaces are deleted, a leading "*." and a trailing "."
// are stripped, the result is passed through idna.ToASCII, and finally it is
// lowercased. An error from idna.ToASCII is deliberately discarded; whatever
// string that call returned is used as-is.
func NormalizeDomain(domain string) string {
	host := removeMisspelledProtocol(domain)

	// Keep only the part that precedes the first path separator of either kind.
	if cut := strings.IndexAny(host, `/\`); cut >= 0 {
		host = host[:cut]
	}

	// Typical typing mistakes: a comma instead of a dot, stray spaces.
	host = strings.ReplaceAll(host, ",", ".")
	host = strings.ReplaceAll(host, " ", "")
	// Intentionally left disabled:
	// host = strings.ReplaceAll(host, "_", "-")      // Replace underscore with hyphen.
	// host = strings.ReplaceAll(host, "wwww", "www") // Fix common "wwww" misprint.

	// Drop a wildcard prefix first, then a root-label dot at the end.
	host = strings.TrimSuffix(strings.TrimPrefix(host, "*."), ".")

	// Best effort: the conversion error is ignored on purpose.
	ascii, _ := idna.ToASCII(host)

	return strings.ToLower(ascii)
}
// NormalizeURL takes a URL string that may contain misprints and attempts to
// construct the correct URL.
//
// It rewrites a misspelled protocol prefix via replaceMisspelledProtocol,
// turns every backslash into a forward slash, parses the result, normalizes
// the host name with NormalizeDomain (keeping the port, if any), and drops the
// fragment.
//
// If parsing fails, the error is logged and the string is returned as it
// stands after the protocol and backslash fixes (not the untouched input).
func NormalizeURL(u string) string {
	// Fix the misspelled protocol, if present.
	u = replaceMisspelledProtocol(u)

	// Replace backslashes with forward slashes.
	u = strings.ReplaceAll(u, "\\", "/")

	nurl, err := url.Parse(u)
	if err != nil {
		logger.Error.Printf("URL parse error: %s\n", err)
		return u
	}

	// Hostname() strips the square brackets around an IPv6 literal. Remember
	// whether they were present so they can be put back; without them the
	// rebuilt "host:port" is ambiguous and the resulting URL is malformed.
	bracketed := strings.HasPrefix(nurl.Host, "[")
	port := nurl.Port()

	// NOTE(review): assumes NormalizeDomain leaves the text of an IPv6
	// literal usable — confirm against idna.ToASCII behavior.
	host := NormalizeDomain(nurl.Hostname())
	if bracketed {
		host = "[" + host + "]"
	}

	// Add the port back to the normalized host, if present.
	if port != "" {
		host += ":" + port
	}
	nurl.Host = host

	// Remove any URL fragment.
	nurl.Fragment = ""

	return nurl.String()
}
// protocolPattern matches a leading "http"/"https" scheme, written correctly
// or with common misprints, at the very start of a string. It accepts:
//
//   - "http", "https" or the truncated "htt", in any letter case,
//   - followed by a delimiter: either a colon with any number of '/' or '\'
//     after it, or at least one '/' or '\' without a colon,
//   - or a bare protocol-relative "//" prefix.
//
// A delimiter is required after the scheme word so that host names that merely
// begin with those letters (for example "httpbin.org") are not mistaken for a
// protocol and truncated.
var protocolPattern = regexp.MustCompile(`(?i)^(?:https?|htt)(?::[/\\]*|[/\\]+)|^//`)

// removeMisspelledProtocol removes a leading "http://" or "https://" prefix,
// or one of the misspellings recognized by protocolPattern, from s. Strings
// without such a prefix are returned unchanged.
func removeMisspelledProtocol(s string) string {
	return protocolPattern.ReplaceAllString(s, "")
}

// replaceMisspelledProtocol replaces a leading "http://" or "https://" prefix,
// or one of the misspellings recognized by protocolPattern, with the correctly
// spelled protocol. A prefix that starts with "https" (in any letter case)
// becomes "https://"; every other recognized prefix, including a bare "//",
// becomes "http://". Strings without such a prefix are returned unchanged.
func replaceMisspelledProtocol(s string) string {
	return protocolPattern.ReplaceAllStringFunc(s, func(match string) string {
		// The pattern is case-insensitive, so compare on a lowercased copy.
		if strings.HasPrefix(strings.ToLower(match), "https") {
			return "https://"
		}
		return "http://"
	})
}
// trimAfterChar returns the part of s that precedes the first occurrence of
// char; the matched character itself is dropped along with everything after
// it. When char does not occur in s, s is returned unchanged.
func trimAfterChar(s string, char byte) string {
	cut := strings.IndexByte(s, char)
	if cut < 0 {
		return s
	}
	return s[:cut]
}