|
3 | 3 |
|
4 | 4 | import markdown |
5 | 5 |
|
6 | | - |
7 | 6 | REGULAR_EXP = r"http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+" |
8 | 7 |
|
9 | | -LINKS = namedtuple("LINKS", ["line", "urls", "skip"]) |
| 8 | +LINKS = namedtuple("LINKS", ["line", "urls", "skip", "valid"]) |
10 | 9 |
|
11 | 10 |
|
12 | 11 | def parse_line(line): |
@@ -42,5 +41,37 @@ def parse_file(file_path): |
42 | 41 | line_links = parse_line(line) |
43 | 42 | if line_links: |
44 | 43 | skip = True if "noqa" in line else False |
45 | | - links.append(LINKS(line=line_number + 1, urls=line_links, skip=skip)) |
| 44 | + links.append(LINKS(line=line_number + 1, urls=line_links, skip=skip, valid=False)) |
46 | 45 | return links |
| 46 | + |
| 47 | + |
def link_validator(links_list):
    """Split each LINKS entry's URLs into valid and invalid entries.

    Every URL in every entry is matched against a URL-shape regex
    (http/https/ftp/ftps scheme, then a domain name, ``localhost`` or a
    dotted-quad IP, an optional port, and an optional path/query).

    Args:
        links_list: Iterable of LINKS named tuples
            (``line``, ``urls``, ``skip``, ``valid``).

    Returns:
        list: New LINKS entries — one per invalid URL (``valid=False``,
        ``skip=True``), plus one entry per source line that contained at
        least one valid URL (``valid=True``, original ``skip`` preserved).
    """
    url_pattern = re.compile(
        r"^(?:http|ftp)s?://"  # scheme: http:// https:// ftp:// ftps://
        r"(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+(?:[A-Z]{2,6}\.?|[A-Z0-9-]{2,}\.?)|"
        # domain name
        r"localhost|"  # localhost...
        r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})"  # ...or IPv4 address
        r"(?::\d+)?"  # optional port
        r"(?:/?|[/?]\S+)$",  # optional path / query string
        re.IGNORECASE,
    )

    validated_list = []
    for link in links_list:
        valid_urls = []
        for url in link.urls:
            # Use the compiled pattern's own match() (idiomatic for compiled regexes).
            if url_pattern.match(url):
                valid_urls.append(url)
            else:
                # Each invalid URL gets its own entry so it can be reported individually.
                validated_list.append(
                    LINKS(line=link.line, urls=[url], skip=True, valid=False)
                )
        # Bug fix: only emit a "valid" entry when at least one URL matched —
        # the original appended LINKS(urls=[], valid=True) for lines whose
        # URLs were all invalid.
        if valid_urls:
            # Bug fix: preserve the noqa/skip flag parsed upstream instead of
            # hard-coding skip=False, which silently discarded it.
            validated_list.append(
                LINKS(line=link.line, urls=valid_urls, skip=link.skip, valid=True)
            )
    return validated_list
0 commit comments