utf 8 - Python 3.x: 'ascii' codec can't encode character '\xfc' in position 18: ordinal not in range(128) -
I already checked the existing questions; none of them works for me.
I wrote code to scrape information from multiple pages of a website.
When I run the code, it returns the error: 'ascii' codec can't encode character '\xfc' in position 18: ordinal not in range(128)
When I test the code on a limited number of links, it works.
problem link:
'https://www.crowdcube.com/investment/brüpond-brewery-10622'
because there is a 'ü' in it.
In this specific case I could simply drop the link, but I would like to know how to handle this problem in general.
Here is the code:
from bs4 import BeautifulSoup
import re
import urllib.request
from time import sleep
from urllib.parse import quote, urlsplit, urlunsplit


def _encode_url(url):
    """Percent-encode non-ASCII characters in *url* so urlopen accepts it.

    urllib.request.urlopen raises UnicodeEncodeError for IRIs such as
    'https://www.crowdcube.com/investment/brüpond-brewery-10622'; encoding
    the path and query (RFC 3986) fixes this in general, not just for 'ü'.
    """
    parts = urlsplit(url)
    path = quote(parts.path, safe="/%")
    query = quote(parts.query, safe="=&%")
    return urlunsplit((parts.scheme, parts.netloc, path, query, parts.fragment))


def make_soup(url):
    """Download *url* and return it parsed as a BeautifulSoup tree (lxml)."""
    html = urllib.request.urlopen(_encode_url(url))
    return BeautifulSoup(html, "lxml")


def get_links(section_url):
    """Return the de-duplicated Crowdcube investment links found on *section_url*."""
    soup = make_soup(section_url)
    # set comprehension de-duplicates, as the original list(set(...)) did
    hrefs = {a.attrs.get('href') for a in soup.select('a[href]')}
    return [link for link in hrefs
            if link and 'https://www.crowdcube.com/investment/' in link]


def get_data(url):
    """Scrape one pitch page into a dict of UTF-8 encoded fields.

    On any expected scraping failure (missing element, bad type, encoding
    problem) every field except the URL is replaced by the placeholder '.'.
    """
    miss = '.'
    tree = make_soup(url)
    try:
        # title
        title = tree.find_all('h2')[0].get_text()

        # description: second 'fullwidth' div, commas etc. flattened to spaces
        description = tree.find_all('div', {'class': 'fullwidth'})
        description = description[1].find('p').get_text()
        description = re.sub(r'[^\w.]', ' ', description)

        # location: commas become '-', first 10 characters are a fixed prefix
        location = tree.find_all('div', {'class': 'pitch-profile'})
        location = location[0].find('li').get_text()
        location = location.replace(',', '-')[10:]

        # raised capital: first char is the currency symbol, ',' -> '.'
        raised = tree.find_all('div', {'class': 'cc-pitch__raised'})
        raised = raised[0].find('b').get_text().replace(',', '.')
        currency = raised[0]
        raised = raised[1:]

        # target: drop the leading currency symbol, ',' -> '.'
        target = tree.find_all('div', {'class': 'cc-pitch__stats clearfix'})
        target = target[0].find('dd').get_text().replace(',', '.')[1:]

        # category: strip whitespace, remove spaces, ',' -> '-'
        category = tree.find_all('li', {'class': 'sectors'})
        category = category[0].find('span').get_text()
        category = category.strip().replace(' ', '').replace(',', '-')

        # backers: fourth <dd> of the stats block (last match wins, as before)
        backers = miss
        for tag in tree.find_all('div', {'class': 'cc-pitch__stats clearfix'}):
            ddtags = tag.find_all('dd')
            backers = ddtags[3].get_text()

        return {"url": url.encode("utf-8"),
                "title": title.encode("utf-8"),
                "backers": backers.encode("utf-8"),
                "description": description.encode("utf-8"),
                "location": location.encode("utf-8"),
                "raised": raised.encode("utf-8"),
                "currency": currency.encode("utf-8"),
                "target": target.encode("utf-8"),
                "category": category.encode("utf-8")}
    except (IndexError, RuntimeError, TypeError, NameError, UnicodeEncodeError):
        # best-effort: a page that does not match the expected layout is
        # recorded with placeholder values rather than aborting the crawl
        return {"url": url, "title": miss, "backers": miss,
                "description": miss, "location": miss, "raised": miss,
                "currency": miss, "target": miss, "category": miss}


if __name__ == '__main__':
    start_url = ("https://www.crowdcube.com/investments?sort_by=0&q=&hof=1"
                 "&i1=0&i2=0&i3=0&i4=0&sort_by=7")
    links = get_links(start_url)
    data = []  # list to store our dictionaries
    for link in links:
        data.append(get_data(link))
        sleep(1)  # be polite to the server between requests
Any suggestion? Thanks in advance.
urllib can't handle umlauts like 'ü' in the URL:
'https://www.crowdcube.com/investment/brüpond-brewery-10622'
Use the requests lib instead; it has no problems with umlauts.
For example, change the make_soup function to this:
import requests


def make_soup(url):
    """Fetch *url* with requests (which handles non-ASCII URLs by encoding
    them itself) and return the body parsed as a BeautifulSoup tree (lxml)."""
    html = requests.get(url).text
    # fix: the class is BeautifulSoup, not beautifulsoup
    return BeautifulSoup(html, "lxml")
Comments
Post a Comment