// url.js

  1. 'use strict'
  2. var ccount = require('ccount')
  3. var decode = require('parse-entities')
  4. var decimal = require('is-decimal')
  5. var alphabetical = require('is-alphabetical')
  6. var whitespace = require('is-whitespace-character')
  7. var locate = require('../locate/url')
  8. module.exports = url
  9. url.locator = locate
  10. url.notInLink = true
  11. var exclamationMark = 33 // '!'
  12. var ampersand = 38 // '&'
  13. var rightParenthesis = 41 // ')'
  14. var asterisk = 42 // '*'
  15. var comma = 44 // ','
  16. var dash = 45 // '-'
  17. var dot = 46 // '.'
  18. var colon = 58 // ':'
  19. var semicolon = 59 // ';'
  20. var questionMark = 63 // '?'
  21. var lessThan = 60 // '<'
  22. var underscore = 95 // '_'
  23. var tilde = 126 // '~'
  24. var leftParenthesisCharacter = '('
  25. var rightParenthesisCharacter = ')'
  26. function url(eat, value, silent) {
  27. var self = this
  28. var gfm = self.options.gfm
  29. var tokenizers = self.inlineTokenizers
  30. var length = value.length
  31. var previousDot = -1
  32. var protocolless = false
  33. var dots
  34. var lastTwoPartsStart
  35. var start
  36. var index
  37. var pathStart
  38. var path
  39. var code
  40. var end
  41. var leftCount
  42. var rightCount
  43. var content
  44. var children
  45. var url
  46. var exit
  47. if (!gfm) {
  48. return
  49. }
  50. // `WWW.` doesn’t work.
  51. if (value.slice(0, 4) === 'www.') {
  52. protocolless = true
  53. index = 4
  54. } else if (value.slice(0, 7).toLowerCase() === 'http://') {
  55. index = 7
  56. } else if (value.slice(0, 8).toLowerCase() === 'https://') {
  57. index = 8
  58. } else {
  59. return
  60. }
  61. // Act as if the starting boundary is a dot.
  62. previousDot = index - 1
  63. // Parse a valid domain.
  64. start = index
  65. dots = []
  66. while (index < length) {
  67. code = value.charCodeAt(index)
  68. if (code === dot) {
  69. // Dots may not appear after each other.
  70. if (previousDot === index - 1) {
  71. break
  72. }
  73. dots.push(index)
  74. previousDot = index
  75. index++
  76. continue
  77. }
  78. if (
  79. decimal(code) ||
  80. alphabetical(code) ||
  81. code === dash ||
  82. code === underscore
  83. ) {
  84. index++
  85. continue
  86. }
  87. break
  88. }
  89. // Ignore a final dot:
  90. if (code === dot) {
  91. dots.pop()
  92. index--
  93. }
  94. // If there are not dots, exit.
  95. if (dots[0] === undefined) {
  96. return
  97. }
  98. // If there is an underscore in the last two domain parts, exit:
  99. // `www.example.c_m` and `www.ex_ample.com` are not OK, but
  100. // `www.sub_domain.example.com` is.
  101. lastTwoPartsStart = dots.length < 2 ? start : dots[dots.length - 2] + 1
  102. if (value.slice(lastTwoPartsStart, index).indexOf('_') !== -1) {
  103. return
  104. }
  105. /* istanbul ignore if - never used (yet) */
  106. if (silent) {
  107. return true
  108. }
  109. end = index
  110. pathStart = index
  111. // Parse a path.
  112. while (index < length) {
  113. code = value.charCodeAt(index)
  114. if (whitespace(code) || code === lessThan) {
  115. break
  116. }
  117. index++
  118. if (
  119. code === exclamationMark ||
  120. code === asterisk ||
  121. code === comma ||
  122. code === dot ||
  123. code === colon ||
  124. code === questionMark ||
  125. code === underscore ||
  126. code === tilde
  127. ) {
  128. // Empty
  129. } else {
  130. end = index
  131. }
  132. }
  133. index = end
  134. // If the path ends in a closing paren, and the count of closing parens is
  135. // higher than the opening count, then remove the supefluous closing parens.
  136. if (value.charCodeAt(index - 1) === rightParenthesis) {
  137. path = value.slice(pathStart, index)
  138. leftCount = ccount(path, leftParenthesisCharacter)
  139. rightCount = ccount(path, rightParenthesisCharacter)
  140. while (rightCount > leftCount) {
  141. index = pathStart + path.lastIndexOf(rightParenthesisCharacter)
  142. path = value.slice(pathStart, index)
  143. rightCount--
  144. }
  145. }
  146. if (value.charCodeAt(index - 1) === semicolon) {
  147. // GitHub doesn’t document this, but final semicolons aren’t paret of the
  148. // URL either.
  149. index--
  150. // // If the path ends in what looks like an entity, it’s not part of the path.
  151. if (alphabetical(value.charCodeAt(index - 1))) {
  152. end = index - 2
  153. while (alphabetical(value.charCodeAt(end))) {
  154. end--
  155. }
  156. if (value.charCodeAt(end) === ampersand) {
  157. index = end
  158. }
  159. }
  160. }
  161. content = value.slice(0, index)
  162. url = decode(content, {nonTerminated: false})
  163. if (protocolless) {
  164. url = 'http://' + url
  165. }
  166. exit = self.enterLink()
  167. // Temporarily remove all tokenizers except text in url.
  168. self.inlineTokenizers = {text: tokenizers.text}
  169. children = self.tokenizeInline(content, eat.now())
  170. self.inlineTokenizers = tokenizers
  171. exit()
  172. return eat(content)({type: 'link', title: null, url: url, children: children})
  173. }