Skip to content

Grab

Simple API

Get

from grab import Grab

g = Grab()
resp = g.go('https://openaccess.thecvf.com/CVPR2020?day=2020-06-16')
# g.doc == resp -> True

Post

from grab import Grab

g = Grab()

g.go('https://github.com/login')
g.doc.set_input('login', '****')
g.doc.set_input('password', '****')
g.doc.submit()

g.doc.save('/tmp/x.html')
home_url = g.doc('//a[contains(@class, "header-nav-link name")]/@href').text()

Handle Response

resp.url
resp.code

resp.body
resp.unicode_body()

resp.cookies

Selector

selectorList = g.doc('/xpath') # g.doc.select('/xpath')
selector = selectorList[0]

selector.html()
selector.text()

Example

crawl videos in yhdm.so .

def crawl(keyword):
    urls = []
    g = Grab()
    g.go(f'http://www.yhdm.so/search/{keyword}/')
    candidate_selectors = g.doc(f'//div[@class="lpic"]/ul/li/a/@href')
    for candidate in candidate_selectors:
        g.go(f'http://www.yhdm.so{candidate.text()}')
        episode_selectors = g.doc('//div[@class="movurl"]/ul/li/a/@href')
        for episode in episode_selectors:
            g.go(f'http://www.yhdm.so{episode.text()}')
            title_selectors = g.doc('//div[@class="gohome l"]/h1')
            title = title_selectors[0].text()
            data_selectors = g.doc('//div[@id="playbox"]/@data-vid')
            url = data_selectors[0].text()
            if url[-4:] == "$mp4":
                url = url[:-4]
                print(f'[INFO] crawled {title} {url}')
                urls.append({
                    'formattedUrl': url,
                    'title': title,
                    'snipped': '',
                })
    return urls

Spider API

from grab.spider import Spider, Task

class ExampleSpider(Spider):
    def task_generator(self):
        for lang in 'python', 'ruby', 'perl':
            url = 'https://www.google.com/search?q=%s' % lang
            yield Task('search', url=url, lang=lang)

    def task_search(self, grab, task):
        print('%s: %s' % (task.lang,
                          grab.doc('//div[@class="s"]//cite').text()))


bot = ExampleSpider(thread_number=2)
bot.run()