Playing with BeautifulSoup

The starting block, which gives us a function we can throw urls at and have it return a BeautifulSoup object (one of my favourite snippets).

from bs4 import BeautifulSoup
from fake_useragent import UserAgent
import requests

ua = UserAgent()


def lovely_soup(url):
    """Fetch *url* (spoofing a Chrome User-Agent) and return a BeautifulSoup tree.

    The fake User-Agent keeps simple bot-blocking at bay; 'lxml' is the parser.
    """
    r = requests.get(url, headers={'User-Agent': ua.chrome})
    return BeautifulSoup(r.text, 'lxml')

# NOTE: the original snippet ended with `soup = lovely_soup(url)`, which is a
# NameError -- no `url` exists at module level yet. The first real call is made
# in the next snippet, so the dangling line is removed here.

Let's throw a url at it and see what it does...

# Fetch the page and dump the whole parsed document to stdout.
target = 'https://recycledrobot.co.uk'
soup = lovely_soup(target)
print(soup)

You'll see a load of HTML fly past you. That's the source of the website we want. Now we'll use BeautifulSoup to extract an element's text. The subtitle for instance...

# Grab a single element's text: the page's subtitle span.
soup = lovely_soup('https://recycledrobot.co.uk')
subtitle = soup.find('span', {'class': 'subtitle'})
print(subtitle.text)

That's pretty much the basic setup covered. Let's move onto other things. Kittens!

# List the (thumbnail-sized) kitten image sources on the tag page.
soup = lovely_soup('https://wallpaperscraft.com/tag/kitten')
for image in soup.findAll('img', {'class': 'wallpapers__image'}):
    print(image['src'])

Woo. We now have links to kitten images, but they're tiny. We'll have to follow the trail and find the original.

# Follow each thumbnail through two intermediate pages to the original file.
listing = lovely_soup('https://wallpaperscraft.com/tag/kitten')
for link in listing.findAll('a', {'class': 'wallpapers__link'}):
    # Wallpaper detail page for this thumbnail.
    detail = lovely_soup('https://wallpaperscraft.com{}'.format(link['href']))
    href = detail.findAll('span', {'class': 'wallpaper-table__cell'})[1].find('a')['href']
    # Resolution page, which finally links to the original image.
    res_page = lovely_soup('https://wallpaperscraft.com{}'.format(href))
    original = res_page.find('a', {'class': 'gui-button_full-height'})['href']
    print(original)

We've jumped through hoops and found the original file. Now let's put all that in a function of its own and go download all these kitten pictures to a directory called "images".

import os


def get_img(get_url):
    """Download the image at *get_url* into the local ``images`` directory.

    Only URLs ending in a known image extension are downloaded; anything
    else is silently ignored.
    """
    if get_url.endswith(('.jpg', '.jpeg', '.png', '.gif')):
        # Robustness fix: create the target directory if it is missing,
        # instead of crashing with FileNotFoundError on the first write.
        os.makedirs('images', exist_ok=True)
        img_name = 'images/{}'.format(get_url.split('/')[-1])
        img = requests.get(get_url).content
        with open(img_name, 'wb') as f:
            f.write(img)


def get_kittens():
    """Walk every wallpaper link on the kitten tag page and download the originals."""
    soup = lovely_soup('https://wallpaperscraft.com/tag/kitten')
    links = soup.findAll('a', {'class': 'wallpapers__link'})
    for link in links:
        url = 'https://wallpaperscraft.com{}'.format(link['href'])
        soup = lovely_soup(url)
        url = soup.findAll('span', {'class': 'wallpaper-table__cell'})[1].find('a')['href']
        url = 'https://wallpaperscraft.com{}'.format(url)
        soup = lovely_soup(url)
        url = soup.find('a', {'class': 'gui-button_full-height'})['href']
        print(url)
        get_img(url)


get_kittens()

There are 15 pictures of kittens. Let's go get some text from somewhere. Bad news headlines!

def get_headlines():
    """Print the listicle's headlines, stripped of their leading numbering."""
    url = 'https://www.copypress.com/blog/40-headlines-the-good-the-bad-and-the-ugly/'
    soup = lovely_soup(url)
    container = soup.find('div', {'class': 'blog-single-inner-cont'})
    # Skip the first <strong>; it is not a headline.
    for strong in container.findAll('strong')[1:]:
        # Drop "12. "-style prefixes and surrounding whitespace.
        print(strong.text.lstrip('0123456789.- ').strip())


get_headlines()

Ok, now what? We save these to a json file of course...

import json


def get_headlines():
    """Scrape the headlines and save them to data.json, each mapped to 0."""
    url = 'https://www.copypress.com/blog/40-headlines-the-good-the-bad-and-the-ugly/'
    soup = lovely_soup(url)
    container = soup.find('div', {'class': 'blog-single-inner-cont'})
    data = {}
    # Skip the first <strong>; it is not a headline.
    for strong in container.findAll('strong')[1:]:
        line = strong.text.lstrip('0123456789.- ').strip()
        print(line)
        data[line] = 0  # placeholder value, put to use later
    with open('data.json', 'w+') as f:
        json.dump(data, f)


get_headlines()

Hang on... That json file is full of zero values. Best use them wisely. Let's revisit our functions and push some data around.

def get_img(get_url):
    """Download *get_url* into images/ and return the local path (None if skipped)."""
    if get_url.endswith(('.jpg', '.jpeg', '.png', '.gif')):
        img_name = 'images/{}'.format(get_url.split('/')[-1])
        # BUG FIX: the bare name `get` is undefined here -- the setup snippet
        # did `import requests`, not `from requests import get`.
        img = requests.get(get_url).content
        with open(img_name, 'wb') as f:
            f.write(img)
        return img_name  # RETURNING KITTEN LOCATION


def get_kittens():
    """Download every kitten wallpaper on the tag page; return their local paths."""
    soup = lovely_soup('https://wallpaperscraft.com/tag/kitten')
    links = soup.findAll('a', {'class': 'wallpapers__link'})
    kittens = []  # CREATE KITTEN LIST
    for link in links:
        url = 'https://wallpaperscraft.com{}'.format(link['href'])
        soup = lovely_soup(url)
        url = soup.findAll('span', {'class': 'wallpaper-table__cell'})[1].find('a')['href']
        url = 'https://wallpaperscraft.com{}'.format(url)
        soup = lovely_soup(url)
        url = soup.find('a', {'class': 'gui-button_full-height'})['href']
        kittens.append(get_img(url))  # COLLECTING KITTENS
    return kittens  # RETURNING ALL KITTEN LOCATIONS


def get_headlines(kittens):
    """Pair each scraped headline with a kitten image path and dump to data.json."""
    soup = lovely_soup('https://www.copypress.com/blog/40-headlines-the-good-the-bad-and-the-ugly/')
    headlines = soup.find('div', {'class': 'blog-single-inner-cont'})
    headlines = headlines.findAll('strong')
    data = {}
    for headline, kitten in zip(headlines[1:], kittens):  # LOOP THROUGH KITTENS AND HEADLINES
        line = headline.text
        line = line.lstrip('0123456789.- ').strip()
        print(line)
        data.update({line: kitten})
    with open('data.json', 'w+') as f:
        json.dump(data, f)


kittens = get_kittens()  # GET KITTENS
get_headlines(kittens)  # PLAY WITH KITTENS

And... Voila! A script which downloads 15 images of kittens along with bad headlines and dumps the data into a json file as headline : image for each entry. Just what we all need. Lovely!

from bs4 import BeautifulSoup
import requests
from fake_useragent import UserAgent
import json

ua = UserAgent()


def lovely_soup(url):
    """Return a BeautifulSoup tree for *url*, fetched with a Chrome User-Agent."""
    r = requests.get(url, headers={'User-Agent': ua.chrome})
    return BeautifulSoup(r.text, 'lxml')


def get_img(get_url):
    """Download *get_url* into images/ and return the local path (None if skipped)."""
    if get_url.endswith(('.jpg', '.jpeg', '.png', '.gif')):
        img_name = 'images/{}'.format(get_url.split('/')[-1])
        # BUG FIX: `get` alone is a NameError -- this script imports the
        # requests module, not `from requests import get`.
        img = requests.get(get_url).content
        with open(img_name, 'wb') as f:
            f.write(img)
        return img_name


def get_kittens():
    """Download every kitten wallpaper on the tag page; return their local paths."""
    soup = lovely_soup('https://wallpaperscraft.com/tag/kitten')
    links = soup.findAll('a', {'class': 'wallpapers__link'})
    kittens = []
    for link in links:
        url = 'https://wallpaperscraft.com{}'.format(link['href'])
        soup = lovely_soup(url)
        url = soup.findAll('span', {'class': 'wallpaper-table__cell'})[1].find('a')['href']
        url = 'https://wallpaperscraft.com{}'.format(url)
        soup = lovely_soup(url)
        url = soup.find('a', {'class': 'gui-button_full-height'})['href']
        kittens.append(get_img(url))
    return kittens


def get_headlines(kittens):
    """Pair each scraped headline with a kitten image path and dump to data.json."""
    soup = lovely_soup('https://www.copypress.com/blog/40-headlines-the-good-the-bad-and-the-ugly/')
    headlines = soup.find('div', {'class': 'blog-single-inner-cont'})
    headlines = headlines.findAll('strong')
    data = {}
    for headline, kitten in zip(headlines[1:], kittens):
        line = headline.text
        line = line.lstrip('0123456789.- ').strip()
        data.update({line: kitten})
    with open('data.json', 'w+') as f:
        json.dump(data, f)


kittens = get_kittens()
get_headlines(kittens)

This is the contents of the JSON file.

{"Eminem Terrified As Daughter Begins Dating Man Raised On His Music": "images/kitten_face_window_fluffy_92898_2046x1333.jpg", "Ways to Make Money While Waiting for Disability Benefits": "images/kitten_ball_thread_white_background_95135_3000x2399.jpg", "How to Have a Healthier and More Productive Home Office": "images/kitten_lying_striped_small_cute_102741_3872x2592.jpg", "A Little Mistake That Cost a Farmer $3,000 a Year": "images/kitten_sleeping_baby_striped_89331_1600x1200.jpg", "Are You Making These Embarrassing Mistakes at Work?": "images/kitten_cat_grass_123220_4272x2848.jpg", "Lose 8 Pounds in 2 Weeks": "images/kitten_fluffy_face_rose_grass_look_85837_2048x1432.jpg", "How Many of These Italian Foods Have You Tried?": "images/kitten_fluffy_look_95571_1920x1200.jpg", "What\u2019s Scarier Than the Sex Talk? Talking About Food & Weight": "images/kitten_briton_look_kid_96414_3000x2000.jpg", "More Than Half of Medical Advice on \u2018Dr. Oz\u2019 Lacks Proof or Contradicts Best Available Science": "images/kitten_cat_computer_keyboard_apple_mac_black_and_white_94218_1920x1280.jpg", "Lack Time? Here Are 4 Convenient Ways to Keep Your Dog Fit": "images/kitten_cat_playful_down_paw_95904_2560x1600.jpg", "How One Stupid Tweet Blew Up Justine Sacco\u2019s Life": "images/kitten_dark_lying_71730_1920x1200.jpg", "10 Signs That You Will NOT Make It As A Successful Photographer": "images/kitten_fluffy_grass_flowers_run_96294_2048x1365.jpg", "Sure-Fire Ways to Ruin Your Marriage": "images/kitten_look_surprise_striped_85821_1920x1538.jpg", "10 Different Types of Girlfriends \u2013 Which One Are You?": "images/kitten_protruding_tongue_photoshoot_pillow_97666_2048x1365.jpg", "More of Us May Be \u201cAlmost Alcoholics\u201d": "images/kitten_fluffy_look_kid_97036_3393x2248.jpg"}

This is how you parse it.

import json

# Load the headline -> image mapping and print each pair.
with open('data.json') as f:
    data = json.load(f)

for headline in data:
    print(headline)
    print(data.get(headline))

But what to do with it?

Tweet it

import json
import tweepy
from random import randint

# Twitter API credentials -- fill in your own.
consumer_key = 'XXXX'
consumer_secret = 'XXXX'
access_key = 'XXXX'
access_secret = 'XXXX'

auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_key, access_secret)
api = tweepy.API(auth)


def tweet_kitten():
    """Tweet one randomly chosen headline with its kitten image attached."""
    with open('data.json') as f:
        data = json.load(f)
    pick = randint(1, len(data))
    for position, headline in enumerate(data, start=1):
        if position == pick:
            # Image path is the value, headline is the caption.
            api.update_with_media(data.get(headline), headline)
            return True


tweet_kitten()

Instaspam it

from InstagramAPI import InstagramAPI
from random import randint
import json

# Instagram credentials -- fill in your own.
instauser = 'XXXX'
instapass = 'XXXX'

api = InstagramAPI(instauser, instapass)
api.login()

with open('data.json') as f:
    data = json.load(f)

# Pick one entry at random and upload it with the headline as caption.
pick = randint(1, len(data))
for position, headline in enumerate(data, start=1):
    if position == pick:
        img = data.get(headline)
        api.uploadPhoto(img, caption=headline)

api.logout()

Build a website

<!DOCTYPE html>
<html>
<head>
  <meta charset="utf-8">
  <meta name="viewport" content="width=device-width, initial-scale=1">
  <title>Playing with BeautifulSoup</title>
  <link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/bulma/0.7.2/css/bulma.min.css">
  <style>
    .kittens {
      padding-top: 3em;
      padding-bottom: 3em;
    }
    .image img {
      object-fit: cover;
    }
    /* Clamp long headlines to a fixed number of lines, with an ellipsis
       fallback for browsers without -webkit-line-clamp support. */
    .line-clamp {
      display: block;
      display: -webkit-box;
      -webkit-box-orient: vertical;
      position: relative;
      line-height: 1.2;
      overflow: hidden;
      text-overflow: ellipsis;
      padding: 0 !important;
    }
    .line-clamp:after {
      content: '...';
      text-align: right;
      bottom: 0;
      right: 0;
      width: 25%;
      display: block;
      position: absolute;
      height: calc(1em * 1.2);
      background: linear-gradient(to right, rgba(255, 255, 255, 0), rgba(255, 255, 255, 1) 75%);
    }
    @supports (-webkit-line-clamp: 1) {
      .line-clamp:after {
        display: none !important;
      }
    }
    .line-clamp-2 {
      -webkit-line-clamp: 2;
      height: calc(1em * 1.2 * 2);
    }
    .footer {
      padding-top: 6em;
      padding-bottom: 6em;
    }
    .footer span {
      text-decoration: line-through;
    }
  </style>
</head>
<body>
  <section class="hero is-black is-medium">
    <div class="hero-body">
      <div class="container">
        <h1 class="title">Web Scraping With Python</h1>
        <h2 class="subtitle">Playing with BeautifulSoup</h2>
      </div>
    </div>
  </section>
  <section class="section">
    <div class="container">
      <!-- Cards are injected here from data.json -->
      <div class="columns is-multiline kittens"></div>
    </div>
  </section>
  <footer class="footer">
    <div class="content has-text-centered">
      <p>Made with <span>love</span> a keyboard</p>
    </div>
  </footer>
  <script src="https://code.jquery.com/jquery-3.3.1.min.js"></script>
  <script>
    // Build one Bulma card per headline/image pair in data.json.
    $.getJSON("data.json", function(data) {
      $.each(data, function(title, image) {
        $('.kittens').append(
          `<div class='column is-4'>
            <div class="card">
              <div class="card-image">
                <figure class="image is-4by3">
                  <img src="${image}" alt="${title}">
                </figure>
              </div>
              <div class="card-content">
                <div class="media">
                  <div class="media-content">
                    <p class="title is-6 line-clamp line-clamp-2">${title}</p>
                  </div>
                </div>
              </div>
            </div>
          </div>`
        );
      });
    });
  </script>
</body>
</html>

Stealing 743 kittens

DISCLAIMER - As always... This is an example and is more than likely against all the rules. Use at your own risk!

I was asked if I could cover downloading all the kitten images instead of just the first page.

from bs4 import BeautifulSoup
from requests import get
from fake_useragent import UserAgent

ua = UserAgent()


def lovely_soup(u):
    """Return a BeautifulSoup tree for *u*, fetched with a Chrome User-Agent."""
    r = get(u, headers={'User-Agent': ua.chrome})
    return BeautifulSoup(r.text, 'lxml')


def get_img(get_url):
    """Save the image at *get_url* under images/ and return its local path."""
    if get_url.endswith(('.jpg', '.jpeg', '.png', '.gif')):
        img_name = 'images/{}'.format(get_url.split('/')[-1])
        img = get(get_url).content
        with open(img_name, 'wb') as f:
            f.write(img)
        return img_name


def get_all_kittens():
    """Download every kitten wallpaper across all pages of the tag listing."""
    soup = lovely_soup('https://wallpaperscraft.com/tag/kitten')
    # The pager's last-page link tells us how many pages to walk.
    last = soup.find('li', {'class': 'pager__item pager__item_last-page'}).find('a')['href']
    total = int(''.join(filter(lambda x: x.isdigit(), last))) + 1
    kittens = []
    for i in range(1, total):
        page = lovely_soup(f'https://wallpaperscraft.com/tag/kitten/page{i}')
        for link in page.findAll('a', {'class': 'wallpapers__link'}):
            detail = lovely_soup('https://wallpaperscraft.com{}'.format(link['href']))
            href = detail.findAll('span', {'class': 'wallpaper-table__cell'})[1].find('a')['href']
            res_page = lovely_soup('https://wallpaperscraft.com{}'.format(href))
            original = res_page.find('a', {'class': 'gui-button_full-height'})['href']
            kittens.append(get_img(original))
    print(len(kittens))


if __name__ == '__main__':
    get_all_kittens()

Thanks for reading. x

Resources