第七色在线视频,2021少妇久久久久久久久久,亚洲欧洲精品成人久久av18,亚洲国产精品特色大片观看完整版,孙宇晨将参加特朗普的晚宴

為了賬號安全,請及時綁定郵箱和手機 立即綁定

如何用python爬取知乎話題?

標(biāo)簽:
Python

因为要做观点,观点的屋子类似于知乎的话题,所以得想办法把他给爬下来,搞了半天最终还是妥妥的搞定了,代码是python写的,不懂得麻烦自学哈!懂得直接看代码,绝对可用


#coding:utf-8

"""

@author:haoning

@create time:2015.8.5

"""

from __future__ import division  # 精确除法

from Queue import Queue

from __builtin__ import False

import json

import os

import re

import platform

import uuid

import urllib

import urllib2

import sys

import time

import MySQLdb as mdb

from bs4 import BeautifulSoup

# Python 2 only: reload(sys) restores the setdefaultencoding attribute that
# site.py deletes, so the process-wide str<->unicode codec can be forced to
# utf-8 (needed because scraped pages and MySQL rows mix byte/unicode strings).
reload(sys)

sys.setdefaultencoding( "utf-8" )

# Request headers used for the Zhihu AJAX endpoint in get_topis().
# NOTE(review): the Cookie is a hard-coded captured session — it expires and
# should come from configuration, not source code.
headers = {

   'User-Agent' : 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:35.0) Gecko/20100101 Firefox/35.0',

   'Content-Type':'application/x-www-form-urlencoded; charset=UTF-8',

   'X-Requested-With':'XMLHttpRequest',

   'Referer':'https://www.zhihu.com/topics',

   'Cookie':'__utma=51854390.517069884.1416212035.1416212035.1416212035.1; q_c1=c02bf44d00d240798bfabcfc95baeb56|1455778173000|1416205243000; _za=b1c8ae35-f986-46a2-b24a-cb9359dc6b2a; aliyungf_tc=AQAAAJ1m71jL1woArKqF22VFnL/wRy6C; _xsrf=9d494558f9271340ab24598d85b2a3c8; cap_id="MDNiMjcwM2U0MTRhNDVmYjgxZWVhOWI0NTA2OGU5OTg=|1455864276|2a4ce8247ebd3c0df5393bb5661713ad9eec01dd"; n_c=1; _alicdn_sec=56c6ba4d556557d27a0f8c876f563d12a285f33a'

}

# Local MySQL credentials (database 'zhihu' is named in the connect call below).
DB_HOST = '127.0.0.1'

DB_USER = 'root'

DB_PASS = 'root'

queue= Queue() # work queue; holds (topic_id, topic_name, parent_name) tuples

nodeSet=set()     # NOTE(review): declared but never used in this file

keywordSet=set()  # NOTE(review): declared but never used in this file

stop=0      # unused flag

offset=-20  # unused here; get_topis() keeps its own local offset

level=0

maxLevel=7

counter=0   # number of rooms inserted so far (see getContent)

base=""

# Connection is opened at import time; commits are issued explicitly so each
# inserted parent row is visible before its children are processed.
conn = mdb.connect(DB_HOST, DB_USER, DB_PASS, 'zhihu', charset='utf8')

conn.autocommit(False)

curr = conn.cursor()

def get_html(url):
    """Fetch *url* and return the raw response body, or None on any failure.

    A 3-second timeout keeps a stalled connection from hanging the crawl.
    Failure is deliberately best-effort: callers treat None as "skip page".
    """
    try:
        req = urllib2.Request(url)
        # TODO: route through a proxy here (original author's note)
        response = urllib2.urlopen(req, None, 3)
        html = response.read()
        return html
    except Exception:
        # Was a bare "except:", which also swallowed KeyboardInterrupt and
        # SystemExit; Exception keeps the best-effort behavior without that.
        pass
    return None

def getTopics():

    url = 'https://www.zhihu.com/topics'

    print url

    try:

        req = urllib2.Request(url)

        response = urllib2.urlopen(req) #鍦ㄨ繖閲屽簲璇ュ姞鍏ヤ唬鐞�

        html = response.read().decode('utf-8')

        print html

        soup = BeautifulSoup(html)

        lis = soup.find_all('li', {'class' : 'zm-topic-cat-item'})

        for li in lis:

            data_id=li.get('data-id')

            name=li.text

            curr.execute('select id from classify_new where name=%s',(name))

            y= curr.fetchone()

            if not y:

                curr.execute('INSERT INTO classify_new(data_id,name)VALUES(%s,%s)',(data_id,name))

        conn.commit()

    except Exception as e:

        print "get topic error",e

def get_extension(name):
    """Return the extension of *name* including the dot, or None if absent."""
    dot = name.rfind('.')
    return name[dot:] if dot != -1 else None

def which_platform():
    """Return the OS name from the platform module (e.g. 'Linux', 'Windows')."""
    return platform.system()

def GetDateString():
    """Return today's local date as 'YYYY-MM-DD' (used as a folder name)."""
    today = time.strftime('%Y-%m-%d', time.localtime(time.time()))
    return str(today)

def makeDateFolder(par,classify):

    try:

        if os.path.isdir(par):

            newFolderName=par + '//' + GetDateString() + '//'  +str(classify)

            if which_platform()=="Linux":

                newFolderName=par + '/' + GetDateString() + "/" +str(classify)

            if not os.path.isdir( newFolderName ):

                os.makedirs( newFolderName )

            return newFolderName

        else:

            return None 

    except Exception,e:

        print "kk",e

    return None 

def download_img(url,classify):

    try:

        extention=get_extension(url)

        if(extention is None):

            return None

        req = urllib2.Request(url)

        resp = urllib2.urlopen(req,None,3)

        dataimg=resp.read()

        name=str(uuid.uuid1()).replace("-","")+"_www.guandn.com"+extention

        top="E://topic_pic"

        folder=makeDateFolder(top, classify)

        filename=None

        if folder is not None:

            filename  =folder+"http://"+name

        try:

            if "e82bab09c_m" in str(url):

                return True

            if not os.path.exists(filename):

                file_object = open(filename,'w+b')

                file_object.write(dataimg)

                file_object.close()

                return '/room/default/'+GetDateString()+'/'+str(classify)+"/"+name

            else:

                print "file exist"

                return None

        except IOError,e1:

            print "e1=",e1

            pass

    except Exception as e:

        print "eee",e

        pass

    return None #如果没有下载下来就利用原来网站的链接

def getChildren(node,name):

    global queue,nodeSet

    try:

        url="https://www.zhihu.com/topic/"+str(node)+"/hot"

        html=get_html(url)

        if html is None:

            return

        soup = BeautifulSoup(html)

        p_ch='父话题'

        node_name=soup.find('div', {'id' : 'zh-topic-title'}).find('h1').text

        topic_cla=soup.find('div', {'class' : 'child-topic'})

        if topic_cla is not None:

            try:

                p_ch=str(topic_cla.text)

                aList = soup.find_all('a', {'class' : 'zm-item-tag'}) #获取所有子节点

                if u'子话题' in p_ch:

                    for a in aList:

                        token=a.get('data-token')

                        a=str(a).replace('\n','').replace('\t','').replace('\r','')

                        start=str(a).find('>')

                        end=str(a).rfind('</a>')

                        new_node=str(str(a)[start+1:end])

                        curr.execute('select id from rooms where name=%s',(new_node)) #先保证名字绝不相同

                        y= curr.fetchone()

                        if not y:

                            print "y=",y,"new_node=",new_node,"token=",token

                            queue.put((token,new_node,node_name))

            except Exception as e:

                print "add queue error",e

    except Exception as e:

        print "get html error",e

def getContent(n,name,p,top_id):

    try:

        global counter

        curr.execute('select id from rooms where name=%s',(name)) #先保证名字绝不相同

        y= curr.fetchone()

        print "exist?? ",y,"n=",n

        if not y:

            url="https://www.zhihu.com/topic/"+str(n)+"/hot"

            html=get_html(url)

            if html is None:

                return

            soup = BeautifulSoup(html)

            title=soup.find('div', {'id' : 'zh-topic-title'}).find('h1').text

            pic_path=soup.find('a',{'id':'zh-avartar-edit-form'}).find('img').get('src')

            description=soup.find('div',{'class':'zm-editable-content'})

            if description is not None:

                description=description.text

            if (u"未归类" in title or u"根话题" in title): #允许入库,避免死循环

                description=None

            tag_path=download_img(pic_path,top_id)

            print "tag_path=",tag_path

            if (tag_path is not None) or tag_path==True:

                if tag_path==True:

                    tag_path=None

                father_id=2 #默认为杂谈

                curr.execute('select id from rooms where name=%s',(p))

                results = curr.fetchall()

                for r in results:

                    father_id=r[0]

                name=title

                curr.execute('select id from rooms where name=%s',(name)) #先保证名字绝不相同

                y= curr.fetchone()

                print "store see..",y

                if not y:

                    friends_num=0

                    temp = time.time()

                    x = time.localtime(float(temp))

                    create_time = time.strftime("%Y-%m-%d %H:%M:%S",x) # get time now

                    create_time

                    creater_id=None

                    room_avatar=tag_path

                    is_pass=1

                    has_index=0

                    reason_id=None  

                    #print father_id,name,friends_num,create_time,creater_id,room_avatar,is_pass,has_index,reason_id

                    ######################有资格入库的内容

                    counter=counter+1

                    curr.execute("INSERT INTO rooms(father_id,name,friends_num,description,create_time,creater_id,room_avatar,is_pass,has_index,reason_id)VALUES(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)",(father_id,name,friends_num,description,create_time,creater_id,room_avatar,is_pass,has_index,reason_id))

                    conn.commit() #必须时时进入数据库,不然找不到父节点

                    if counter % 200==0:

                        print "current node",name,"num",counter

    except Exception as e:

        print "get content error",e       

def work():

    global queue

    curr.execute('select id,node,parent,name from classify where status=1')

    results = curr.fetchall()

    for r in results:

        top_id=r[0]

        node=r[1]

        parent=r[2]

        name=r[3]

        try:

            queue.put((node,name,parent)) #首先放入队列

            while queue.qsize() >0:

                n,p=queue.get() #顶节点出队

                getContent(n,p,top_id)

                getChildren(n,name) #出队内容的子节点

            conn.commit()

        except Exception as e:

            print "what's wrong",e  

def new_work():
    """Crawl every category in classify_new_copy marked status=1."""
    global queue
    curr.execute('select id,data_id,name from classify_new_copy where status=1')
    results = curr.fetchall()
    for r in results:
        top_id = r[0]
        data_id = r[1]
        name = r[2]
        try:
            get_topis(data_id, name, top_id)
        except Exception:
            # Best-effort per category; was a bare "except:" which also
            # swallowed KeyboardInterrupt, making the crawler unstoppable.
            pass

def get_topis(data_id,name,top_id):

    global queue

    url = 'https://www.zhihu.com/node/TopicsPlazzaListV2'

    isGet = True;

    offset = -20;

    data_id=str(data_id)

    while isGet:

        offset = offset + 20

        values = {'method': 'next', 'params': '{"topic_id":'+data_id+',"offset":'+str(offset)+',"hash_id":""}'}

        try:

            msg=None

            try:

                data = urllib.urlencode(values)

                request = urllib2.Request(url,data,headers)

                response = urllib2.urlopen(request,None,5)

                html=response.read().decode('utf-8')

                json_str = json.loads(html)

                ms=json_str['msg']

                if len(ms) <5:

                    break

                msg=ms[0]

            except Exception as e:

                print "eeeee",e

            #print msg

            if msg is not None:

                soup = BeautifulSoup(str(msg))

                blks = soup.find_all('div', {'class' : 'blk'})

                for blk in blks:

                    page=blk.find('a').get('href')

                    if page is not None:

                        node=page.replace("/topic/","") #将更多的种子入库

                        parent=name

                        ne=blk.find('strong').text

                        try:

                            queue.put((node,ne,parent)) #首先放入队列

                            while queue.qsize() >0:

                                n,name,p=queue.get() #顶节点出队

                                size=queue.qsize()

                                if size > 0:

                                    print size

                                getContent(n,name,p,top_id)

                                getChildren(n,name) #出队内容的子节点

                            conn.commit()

                        except Exception as e:

                            print "what's wrong",e  

        except urllib2.URLError, e:

            print "error is",e

            pass 

if __name__ == '__main__':
    # Run the crawl pass 400 times back to back, same as the old
    # while-counter loop.
    for _ in range(400):
        new_work()

點擊查看更多內容
TA 點贊

若覺得本文不錯,就分享一下吧!

評論

作者其他優質文章

正在加載中
  • 推薦
  • 評(píng)論
  • 收藏
  • 共同學(xué)習(xí),寫下你的評(píng)論
感謝您的支持,我會繼續努力的~
掃碼打賞,你說多少就多少
贊賞金額會(huì)直接到老師賬戶
支付方式
打開微信掃一掃,即可進(jìn)行掃碼打賞哦
今天注冊有機會得

100積分直接送

付費專欄免費學

大額優惠券免費領

立即參與 放棄機會
微信客服

購課補貼
聯系客服咨詢優惠詳情

幫助反饋 APP下載

慕課網(wǎng)APP
您的移動(dòng)學(xué)習(xí)伙伴

公眾號(hào)

掃描二維碼
關(guān)注慕課網(wǎng)微信公眾號(hào)

舉報(bào)

0/150
提交
取消