|
| 1 | +package aggregation |
| 2 | + |
| 3 | +/* |
| 4 | +Regular expression for extracting URLs from https://github.com/mvdan/xurls |
| 5 | +
|
| 6 | +Copyright (c) 2015, Daniel Martí. All rights reserved. |
| 7 | +
|
| 8 | +Redistribution and use in source and binary forms, with or without |
| 9 | +modification, are permitted provided that the following conditions are |
| 10 | +met: |
| 11 | +
|
| 12 | + * Redistributions of source code must retain the above copyright |
| 13 | +notice, this list of conditions and the following disclaimer. |
| 14 | + * Redistributions in binary form must reproduce the above |
| 15 | +copyright notice, this list of conditions and the following disclaimer |
| 16 | +in the documentation and/or other materials provided with the |
| 17 | +distribution. |
| 18 | + * Neither the name of the copyright holder nor the names of its |
| 19 | +contributors may be used to endorse or promote products derived from |
| 20 | +this software without specific prior written permission. |
| 21 | +
|
| 22 | +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS |
| 23 | +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT |
| 24 | +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR |
| 25 | +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT |
| 26 | +OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, |
| 27 | +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT |
| 28 | +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, |
| 29 | +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY |
| 30 | +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT |
| 31 | +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE |
| 32 | +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
| 33 | +*/ |
| 34 | +import ( |
| 35 | + "net/url" |
| 36 | + "regexp" |
| 37 | +) |
| 38 | + |
// allowedUcsChar is a regexp character-class fragment (a list of ranges with
// no surrounding brackets) naming the non-ASCII code points accepted inside a
// URL. It is concatenated into midIPathSegmentChar and only becomes a real
// class once a pattern wraps it in [...]. Because this is an interpreted
// (double-quoted) Go string, the \u and \U escapes are expanded by the Go
// compiler, so the regexp package receives the literal characters.
| 39 | +const allowedUcsChar = "¡-ᙿᚁ-\u1fff\u200b-‧\u202a-\u202e‰-⁞\u2060-\u2fff、-\ud7ff豈-﷏ﷰ-\uffef𐀀-\U0001fffd𠀀-\U0002fffd𰀀-\U0003fffd\U00040000-\U0004fffd\U00050000-\U0005fffd\U00060000-\U0006fffd\U00070000-\U0007fffd\U00080000-\U0008fffd\U00090000-\U0009fffd\U000a0000-\U000afffd\U000b0000-\U000bfffd\U000c0000-\U000cfffd\U000d0000-\U000dfffd\U000e1000-\U000efffd" |
// allowedUcsCharMinusPunctuation is a regexp character-class fragment (ranges
// only, no surrounding brackets) used through endIPathSegmentChar for the
// character that may END a URL match. As the name indicates it is a narrower
// companion of allowedUcsChar with punctuation code points left out (for
// example it starts at ¢ where allowedUcsChar starts at ¡).
//
// The Halfwidth and Fullwidth Forms section (U+FF04 … U+FFEF) is deliberately
// spelled with \u escapes rather than literal characters. If those code points
// are ever folded to their ASCII look-alikes ("$", "(", ")", "[", "]", …) the
// bare "]" terminates the enclosing [...] class early and the ASCII ranges
// pull ",", "." and "/" into the end-character set, silently breaking URL
// matching. Escapes survive such width normalisation of the source file.
const allowedUcsCharMinusPunctuation = "¢-¦¨-µ¸-¾À-ͽͿ-ΆΈ-ՙՠ-ֈ֊-ֿׁ-ׂׄ-ׇׅ-ײ\u05f5-؈؋؎-ؚ\u061cؠ-٩ٮ-ۓە-ۿ\u070e-߶ߺ-\u082f\u083f-\u085d\u085f-ॣ०-९ॱ-ৼ৾-ੵ\u0a77-૯૱-\u0c76౸-ಃಅ-ෳ\u0df5-๎๐-๙\u0e5c-༃༓༕-྄྆-࿏࿕-࿘\u0fdb-၉ၐ-ჺჼ-፟፩-᙭ᙯ-ᙿᚁ-ᛪᛮ-᜴\u1737-៓ៗ៛-\u17ff᠆᠋-\u1943᥆-\u1a1dᨠ-\u1a9fᪧ\u1aae-᭙᭡-᭼\u1b7f-\u1bfbᰀ-\u1c3a᱀-ᱽᲀ-Ჿ\u1cc8-᳔᳒-\u1fff\u200b-―‘-‟\u202a-\u202e‹-›‿-⁀⁄-⁆⁒⁔\u2060-\u2cf8⳽ⴀ-ⵯ\u2d71-ⷿ⸂-⸅⸉-⸊⸌-⸍⸗⸚⸜-⸝⸠-⸩ⸯ⸺-⸻⹀⹂⹐-⹑⹕-\u2fff〄-〼〾-ヺー-ꓽꔀ-ꘌꘐ-꙲ꙴ-꙽ꙿ-꛱\ua6f8-ꡳ\ua878-\ua8cd꣐-ꣷꣻꣽ-꤭ꤰ-\ua95eꥠ-꧀\ua9ce-\ua9ddꧠ-\uaa5bꩠ-ꫝꫠ-ꫯꫲ-ꯪ꯬-\ud7ff豈-﷏ﷰ-️︗-︘\ufe1a-︯︱-﹄﹇-﹈﹍-﹏\ufe53﹘-﹞﹢-\ufe67﹩\ufe6c-\uff00\uff04\uff08-\uff09\uff0b\uff0d\uff10-\uff19\uff1c-\uff1e\uff21-\uff3b\uff3d-\uff60\uff62-\uff63\uff66-\uffef𐀀-\U000100ff\U00010103-\U0001039e𐎠-𐏏𐏑-\U0001056e𐕰-\U00010856𐡘-\U0001091e𐤠-\U0001093e\U00010940-\U00010a4f\U00010a59-𐩾𐪀-𐫯\U00010af7-\U00010b38𐭀-\U00010b98\U00010b9d-𐽔\U00010f5a-𐾅\U00010f8a-𑁆\U0001104e-𑂺\U000110bd𑃂-𑄿𑅄-𑅳𑅶-𑇄𑇉-𑇌𑇎-𑇚𑇜\U000111e0-𑈷𑈾-𑊨\U000112aa-𑑊𑑐-𑑙\U0001145c𑑞-𑓅𑓇-𑗀𑗘-𑙀𑙄-\U0001165f\U0001166d-𑚸\U000116ba-𑜻𑜿-𑠺\U0001183c-𑥃\U00011947-𑧡𑧣-𑨾𑩇-𑪙𑪝\U00011aa3-\U00011aff\U00011b0a-𑱀\U00011c46-\U00011c6f𑱲-𑻶\U00011ef9-𑽂𑽐-\U00011ffe𒀀-\U0001246f\U00012475-𒿰\U00012ff3-\U00016a6d𖩰-𖫴\U00016af6-𖬶𖬼-𖭃𖭅-𖺖\U00016e9b-𖿡𖿣-𛲞\U0001bca0-𝪆\U0001da8c-\U0001e95d\U0001e960-\U0001fffd𠀀-\U0002fffd𰀀-\U0003fffd\U00040000-\U0004fffd\U00050000-\U0005fffd\U00060000-\U0006fffd\U00070000-\U0007fffd\U00080000-\U0008fffd\U00090000-\U0009fffd\U000a0000-\U000afffd\U000b0000-\U000bfffd\U000c0000-\U000cfffd\U000d0000-\U000dfffd\U000e1000-\U000efffd"
| 41 | + |
// Building blocks for the URL-matching regular expression.
//
// Every constant whose name ends in "Char" is the INSIDE of a character class
// (no surrounding brackets) so the fragments can be concatenated and wrapped in
// [...] by the patterns further down. Constants prefixed "mid" describe what
// may appear in the middle of a match; constants prefixed "end" describe what
// may be its final character and are narrower than their "mid" counterparts.
| 42 | +const ( |
// ASCII letters, digits and the four marks "-", ".", "_", "~".
| 43 | + unreservedChar = `a-zA-Z0-9\-._~` |
// Same as unreservedChar but without "." .
| 44 | + endUnreservedChar = `a-zA-Z0-9\-_~` |
// Delimiters accepted mid-match. "(" and ")" are not listed here; parentheses
// are only matched through wellParen below.
| 45 | + midSubDelimChar = `!$&'*+,;=` |
// The subset of midSubDelimChar that is also accepted as a final character.
| 46 | + endSubDelimChar = `$&+=` |
// Mid-match set: unreserved characters, "%", delimiters, ":" and "@", plus the
// non-ASCII ranges from allowedUcsChar.
| 47 | + midIPathSegmentChar = unreservedChar + `%` + midSubDelimChar + `:@` + allowedUcsChar |
// Final-character set: the narrower "end" fragments plus the non-ASCII ranges
// from allowedUcsCharMinusPunctuation. ":" and "@" are not included.
| 48 | + endIPathSegmentChar = endUnreservedChar + `%` + endSubDelimChar + allowedUcsCharMinusPunctuation |
// Code point ranges U+E000–U+F8FF, U+F0000–U+FFFFD and U+100000–U+10FFFD (the
// Unicode private-use ranges). This is a raw string, so the \x{...} escapes are
// decoded by the regexp engine rather than by the Go compiler.
| 49 | + iPrivateChar = `\x{E000}-\x{F8FF}\x{F0000}-\x{FFFFD}\x{100000}-\x{10FFFD}` |
// Everything accepted mid-match: "/", "?", "#", a literal backslash (`\\` inside
// a class), the mid path-segment set and the private-use ranges.
| 50 | + midIChar = `/?#\\` + midIPathSegmentChar + iPrivateChar |
// Everything accepted as the final character: "/" and "#" (no "?" or
// backslash), the end path-segment set and the private-use ranges.
| 51 | + endIChar = `/#` + endIPathSegmentChar + iPrivateChar |
// A balanced "(...)" group: the body is any mix of midIChar characters and
// inner "(" midIChar* ")" groups, i.e. at most one extra level of nesting.
| 52 | + wellParen = `\((?:[` + midIChar + `]|\([` + midIChar + `]*\))*\)` |
// Same shape as wellParen, for "[...]".
| 53 | + wellBracket = `\[(?:[` + midIChar + `]|\[[` + midIChar + `]*\])*\]` |
// Same shape as wellParen, for "{...}".
| 54 | + wellBrace = `\{(?:[` + midIChar + `]|\{[` + midIChar + `]*\})*\}` |
// Alternation of the three balanced-group patterns. It is not wrapped in a
// group of its own, so callers must embed it inside one (pathCont does).
| 55 | + wellAll = wellParen + `|` + wellBracket + `|` + wellBrace |
// Everything after the scheme: one or more repetitions of "a possibly empty run
// of midIChar characters followed by either a balanced group or exactly one
// endIChar character". Each repetition therefore finishes on a balanced group
// or on a character from the narrower end set.
| 56 | + pathCont = `(?:[` + midIChar + `]*(?:` + wellAll + `|[` + endIChar + `]))+` |
// Accepted URL prefixes: "http://" or "https://". The (?i) flag makes the match
// case-insensitive and is scoped to this non-capturing group.
| 57 | + schemes = `(?:(?i)(?:http|https)://)` |
| 58 | +) |
| 59 | + |
| 60 | +func extractUrls(text string) []string { |
| 61 | + re := regexp.MustCompile(schemes + pathCont) |
| 62 | + re.Longest() |
| 63 | + |
| 64 | + var urls []string |
| 65 | + for _, match := range re.FindAllString(text, -1) { |
| 66 | + if _, err := url.Parse(match); err == nil { |
| 67 | + urls = append(urls, match) |
| 68 | + } |
| 69 | + } |
| 70 | + return urls |
| 71 | +} |
0 commit comments