Untitled

def _extract_url_links(html):
    """Return the href values found in *html*, skipping anchors and images.

    Every tag that carries an ``href`` attribute is considered. A link is
    dropped when it contains ``#`` or when the path component of its URL
    ends in ``.png``, ``.jpg`` or ``.gif``.

    Args:
        html: HTML markup (a full document or a fragment).

    Returns:
        A list of href strings in document order.

    >>> _extract_url_links('aa<a href="link1">link1</a>bb<a href="link2">link2</a>cc')
    ['link1', 'link2']
    """
    from urllib.parse import urlparse

    img_suffixes = (".png", ".jpg", ".gif")

    # "html.parser" selects the parser that ships with Python, so no extra
    # parser package is needed for BeautifulSoup to handle the markup.
    soup = BeautifulSoup(html, "html.parser")

    # html.parser does not synthesize a <body> for fragments, so find('body')
    # can be None; fall back to searching the whole parsed tree.
    root = soup.find('body')
    if root is None:
        root = soup

    all_url = []
    # href=True selects every descendant tag that has an href attribute.
    for tag in root.find_all(href=True):
        href = tag.get('href')
        if '#' in href:
            continue
        # Test the extension against the URL path only, so that a host such
        # as "www.gift.example.com" is not mistaken for a ".gif" image and a
        # query string after the file name ("a.jpg?v=1") is still recognized.
        try:
            path = urlparse(href).path
        except ValueError:
            # Malformed URL (e.g. unbalanced IPv6 brackets): judge the raw text.
            path = href
        if not path.endswith(img_suffixes):
            all_url.append(href)
    return all_url

def _extract_url_links(html):
    """Collect link targets from *html*, excluding anchors and image files.

    All tags with an ``href`` attribute are inspected. An href is kept only
    when it contains no ``#`` and the path component of the URL does not end
    in ``.png``, ``.jpg`` or ``.gif``.

    Args:
        html: HTML markup (a full document or a fragment).

    Returns:
        A list of the accepted href strings in document order.

    >>> _extract_url_links('aa<a href="link1">link1</a>bb<a href="link2">link2</a>cc')
    ['link1', 'link2']
    """
    from urllib.parse import urlparse

    img_suffixes = (".png", ".jpg", ".gif")

    # "html.parser" makes BeautifulSoup use Python's bundled parser.
    soup = BeautifulSoup(html, "html.parser")

    # A fragment parsed with html.parser has no <body>; search the whole
    # tree in that case instead of calling a method on None.
    body_soup = soup.find('body')
    if body_soup is None:
        body_soup = soup

    all_url = []
    for child_tag in body_soup.find_all(href=True):
        href = child_tag.get('href')
        try:
            path = urlparse(href).path
        except ValueError:
            # Unparseable URL: fall back to checking the raw href text.
            path = href
        # The exclusions must ALL hold for a link to be kept, so they are
        # joined with "and"; joining "x not in href" tests with "or" is true
        # for practically every href and filters nothing.
        if '#' not in href and not path.endswith(img_suffixes):
            all_url.append(href)
    return all_url

import re

# Fragment intended for the body of the link loop: it relies on `child_tag`
# (the tag being inspected) and `all_url` (the result list) from that scope.
#
# The pattern describes hrefs to SKIP: any href containing "#", or one whose
# file name ends in .jpg/.png/.gif (end of string or start of a query string).
# Anchoring the extension this way keeps hosts like "www.gift.example.com".
href = child_tag.get('href')
m = re.search(r'#|\.(?:jpg|png|gif)(?:\?|$)', href)
if m is None:
    # No skip pattern matched, so this is a link worth keeping.
    all_url.append(href)

#!/usr/bin/python3
# In Python 2 this module was named urlparse
from urllib.parse import urlparse

# Sample URLs used to exercise the filter
urls = (
    "https://example.com",
    "https://example.com/abc.html",
    "https://example.com/abc.html#top",
    "https://example.com/abc.jpg",
    "https://example.com/abc.jpg?v=123",
    "https://www.gift.example.com/",
)

# Path endings that identify an image resource
img_suffixes = (".png", ".jpg", ".gif")

# Keep a URL unless it carries a fragment or its path ends with an image
# suffix; each URL is paired with its parsed form so it is parsed only once.
all_url = [
    candidate
    for candidate, parts in zip(urls, map(urlparse, urls))
    if not (parts.fragment or parts.path.endswith(img_suffixes))
]
print(all_url)

['https://example.com', 'https://example.com/abc.html', 'https://www.gift.example.com/']