Skip to content

Commit ffc5109

Browse files
committed
Add url extraction code
1 parent c548bcc commit ffc5109

File tree

2 files changed

+108
-0
lines changed

2 files changed

+108
-0
lines changed

pkg/aggregation/urlextraction.go

Lines changed: 71 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,71 @@
1+
package aggregation
2+
3+
/*
4+
Regular expression for extracting URLs from https://github.com/mvdan/xurls
5+
6+
Copyright (c) 2015, Daniel Martí. All rights reserved.
7+
8+
Redistribution and use in source and binary forms, with or without
9+
modification, are permitted provided that the following conditions are
10+
met:
11+
12+
* Redistributions of source code must retain the above copyright
13+
notice, this list of conditions and the following disclaimer.
14+
* Redistributions in binary form must reproduce the above
15+
copyright notice, this list of conditions and the following disclaimer
16+
in the documentation and/or other materials provided with the
17+
distribution.
18+
* Neither the name of the copyright holder nor the names of its
19+
contributors may be used to endorse or promote products derived from
20+
this software without specific prior written permission.
21+
22+
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
23+
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
24+
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
25+
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
26+
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
27+
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
28+
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
29+
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
30+
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
31+
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
32+
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
33+
*/
34+
import (
35+
"net/url"
36+
"regexp"
37+
)
38+
39+
const allowedUcsChar = "¡-ᙿᚁ-\u1fff\u200b-‧\u202a-\u202e‰-⁞\u2060-\u2fff、-\ud7ff豈-﷏ﷰ-\uffef𐀀-\U0001fffd𠀀-\U0002fffd𰀀-\U0003fffd\U00040000-\U0004fffd\U00050000-\U0005fffd\U00060000-\U0006fffd\U00070000-\U0007fffd\U00080000-\U0008fffd\U00090000-\U0009fffd\U000a0000-\U000afffd\U000b0000-\U000bfffd\U000c0000-\U000cfffd\U000d0000-\U000dfffd\U000e1000-\U000efffd"
40+
const allowedUcsCharMinusPunctuation = "¢-¦¨-µ¸-¾À-ͽͿ-ΆΈ-ՙՠ-ֈ֊-ֿׁ-ׂׄ-ׇׅ-ײ\u05f5-؈؋؎-ؚ\u061cؠ-٩ٮ-ۓە-ۿ\u070e-߶ߺ-\u082f\u083f-\u085d\u085f-ॣ०-९ॱ-ৼ৾-ੵ\u0a77-૯૱-\u0c76౸-ಃಅ-ෳ\u0df5-๎๐-๙\u0e5c-༃༓༕-྄྆-࿏࿕-࿘\u0fdb-၉ၐ-ჺჼ-፟፩-᙭ᙯ-ᙿᚁ-ᛪᛮ-᜴\u1737-៓ៗ៛-\u17ff᠆᠋-\u1943᥆-\u1a1dᨠ-\u1a9f\u1aae-᭙᭡-᭼\u1b7f-\u1bfbᰀ-\u1c3a᱀-ᱽᲀ-Ჿ\u1cc8-᳔᳒-\u1fff\u200b-―‘-‟\u202a-\u202e‹-›‿-⁀⁄-⁆⁒⁔\u2060-\u2cf8⳽ⴀ-ⵯ\u2d71-ⷿ⸂-⸅⸉-⸊⸌-⸍⸗⸚⸜-⸝⸠-⸩ⸯ⸺-⸻⹀⹂⹐-⹑⹕-\u2fff〄-〼〾-ヺー-ꓽꔀ-ꘌꘐ-꙲ꙴ-꙽ꙿ-꛱\ua6f8-ꡳ\ua878-\ua8cd꣐-ꣷꣻꣽ-꤭ꤰ-\ua95eꥠ-꧀\ua9ce-\ua9ddꧠ-\uaa5bꩠ-ꫝꫠ-ꫯꫲ-ꯪ꯬-\ud7ff豈-﷏ﷰ-️︗-︘\ufe1a-︯︱-﹄﹇-﹈﹍-﹏\ufe53﹘-﹞﹢-\ufe67\ufe6c-\uff00$(-)+-0-9<->A-[]-⦆「-」ヲ-\uffef𐀀-\U000100ff\U00010103-\U0001039e𐎠-𐏏𐏑-\U0001056e𐕰-\U00010856𐡘-\U0001091e𐤠-\U0001093e\U00010940-\U00010a4f\U00010a59-𐩾𐪀-𐫯\U00010af7-\U00010b38𐭀-\U00010b98\U00010b9d-𐽔\U00010f5a-𐾅\U00010f8a-𑁆\U0001104e-𑂺\U000110bd𑃂-𑄿𑅄-𑅳𑅶-𑇄𑇉-𑇌𑇎-𑇚𑇜\U000111e0-𑈷𑈾-𑊨\U000112aa-𑑊𑑐-𑑙\U0001145c𑑞-𑓅𑓇-𑗀𑗘-𑙀𑙄-\U0001165f\U0001166d-𑚸\U000116ba-𑜻𑜿-𑠺\U0001183c-𑥃\U00011947-𑧡𑧣-𑨾𑩇-𑪙𑪝\U00011aa3-\U00011aff\U00011b0a-𑱀\U00011c46-\U00011c6f𑱲-𑻶\U00011ef9-𑽂𑽐-\U00011ffe𒀀-\U0001246f\U00012475-𒿰\U00012ff3-\U00016a6d𖩰-𖫴\U00016af6-𖬶𖬼-𖭃𖭅-𖺖\U00016e9b-𖿡𖿣-𛲞\U0001bca0-𝪆\U0001da8c-\U0001e95d\U0001e960-\U0001fffd𠀀-\U0002fffd𰀀-\U0003fffd\U00040000-\U0004fffd\U00050000-\U0005fffd\U00060000-\U0006fffd\U00070000-\U0007fffd\U00080000-\U0008fffd\U00090000-\U0009fffd\U000a0000-\U000afffd\U000b0000-\U000bfffd\U000c0000-\U000cfffd\U000d0000-\U000dfffd\U000e1000-\U000efffd"
41+
42+
const (
43+
unreservedChar = `a-zA-Z0-9\-._~`
44+
endUnreservedChar = `a-zA-Z0-9\-_~`
45+
midSubDelimChar = `!$&'*+,;=`
46+
endSubDelimChar = `$&+=`
47+
midIPathSegmentChar = unreservedChar + `%` + midSubDelimChar + `:@` + allowedUcsChar
48+
endIPathSegmentChar = endUnreservedChar + `%` + endSubDelimChar + allowedUcsCharMinusPunctuation
49+
iPrivateChar = `\x{E000}-\x{F8FF}\x{F0000}-\x{FFFFD}\x{100000}-\x{10FFFD}`
50+
midIChar = `/?#\\` + midIPathSegmentChar + iPrivateChar
51+
endIChar = `/#` + endIPathSegmentChar + iPrivateChar
52+
wellParen = `\((?:[` + midIChar + `]|\([` + midIChar + `]*\))*\)`
53+
wellBracket = `\[(?:[` + midIChar + `]|\[[` + midIChar + `]*\])*\]`
54+
wellBrace = `\{(?:[` + midIChar + `]|\{[` + midIChar + `]*\})*\}`
55+
wellAll = wellParen + `|` + wellBracket + `|` + wellBrace
56+
pathCont = `(?:[` + midIChar + `]*(?:` + wellAll + `|[` + endIChar + `]))+`
57+
schemes = `(?:(?i)(?:http|https)://)`
58+
)
59+
60+
func extractUrls(text string) []string {
61+
re := regexp.MustCompile(schemes + pathCont)
62+
re.Longest()
63+
64+
var urls []string
65+
for _, match := range re.FindAllString(text, -1) {
66+
if _, err := url.Parse(match); err == nil {
67+
urls = append(urls, match)
68+
}
69+
}
70+
return urls
71+
}
Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,37 @@
1+
package aggregation
2+
3+
import (
4+
"reflect"
5+
"testing"
6+
)
7+
8+
func TestExtractUrls(t *testing.T) {
9+
tests := []struct {
10+
name string
11+
text string
12+
want []string
13+
}{
14+
{
15+
name: "single",
16+
text: "foo http://localhost bar",
17+
want: []string{"http://localhost"},
18+
},
19+
{
20+
name: "multiple",
21+
text: "foo http://localhost https://www.example.com bar",
22+
want: []string{"http://localhost", "https://www.example.com"},
23+
},
24+
{
25+
name: "complex",
26+
text: "foo https://www.example.com/x/1234?foo=bar bar",
27+
want: []string{"https://www.example.com/x/1234?foo=bar"},
28+
},
29+
}
30+
for _, tt := range tests {
31+
t.Run(tt.name, func(t *testing.T) {
32+
if got := extractUrls(tt.text); !reflect.DeepEqual(got, tt.want) {
33+
t.Errorf("extractUrls() = %v, want %v", got, tt.want)
34+
}
35+
})
36+
}
37+
}

0 commit comments

Comments
 (0)