I often use teratail (https://teratail.com). Occasionally a question sits there for years without receiving a single answer. I wondered which categories (tags) have the most of these unanswered questions, so I decided to find out with a bit of scraping.
What I noticed is that the search results always end at page 500: even if you click the "Next" button, the same 500th page is displayed again — an endless loop. So I will scrape in a way that avoids it.
This time I will use selenium with Python 3.7.
--Access the unanswered-questions URL "https://teratail.com/feed/not-answered/"
No_answered_Tags.py
def main():
    """Scrape teratail's unanswered-question search pages and count tag frequencies.

    Walks the paginated search results, collects the text of every link whose
    href contains "tags", tallies how often each tag appears, and prints the
    tallies as a pandas DataFrame sorted by count (descending).
    """
    import pandas as pd
    from selenium import webdriver
    from selenium.webdriver.chrome.options import Options
    from selenium.webdriver.support.ui import WebDriverWait
    from selenium.webdriver.support import expected_conditions as EC
    from selenium.common.exceptions import TimeoutException

    options = Options()
    options.add_argument('--headless')  # don't open a browser window
    browser = webdriver.Chrome(executable_path='/Users/anatanonamae/Desktop/Tool/chromedriver', chrome_options=options)
    browser.implicitly_wait(3)  # default wait (seconds) for element lookups

    def _search_url(page):
        # Build the unanswered-question search URL for the given page number.
        return "https://teratail.com/search?tab=active&page=" + str(page) + "&q=is%3Anot-answered"

    # Access the first page
    PAGE = 1
    browser.get(_search_url(PAGE))
    print("I accessed the first page")

    # Information gathering on each page
    TAG_DIC = {}
    try:
        while True:
            # Collect every <a> element and keep the non-empty link texts
            # whose href contains "tags" (these are the question tags).
            taglist = []
            for anchor in browser.find_elements_by_tag_name("a"):
                href = anchor.get_attribute('href')
                if "tags" in str(href) and anchor.text:
                    taglist.append(anchor.text)
            for tag in taglist:
                TAG_DIC[tag] = TAG_DIC.get(tag, 0) + 1

            # The site loops forever at page 500, so only advance while a real
            # "Following page" link is present in the pagination footer.
            NEXT_XPATH = browser.find_elements_by_xpath("//*[@id=\"mainContainer\"]/div[4]/div/p/a/span[contains(text(),\'Following page\')]")
            if NEXT_XPATH:  # Add PAGE if there is next
                PAGE += 1
            else:
                print("Got tags at last page.")  # no next link: we are done
                break
            # BUG FIX: the original did browser.get(URL) with URL undefined
            # (NameError on the second iteration); rebuild the URL from PAGE.
            browser.get(_search_url(PAGE))
            WebDriverWait(browser, 2).until(EC.presence_of_all_elements_located)
            print(browser.current_url)
            if browser.title == "Page Not Found":
                print("Got tags at last page.")  # next page errored: stop
                break
    finally:
        browser.quit()  # always release the ChromeDriver process

    # Post-processing: Creating a Dataframe
    df = pd.DataFrame([TAG_DIC.keys(), TAG_DIC.values()], index=None).T  # Convert to Dataframe
    df.rename(columns={0: "Tag", 1: "Count"}, inplace=True)  # Rename column
    df.sort_values(by=['Count'], ascending=False, inplace=True)  # Sort in descending order
    df.reset_index(drop=True, inplace=True)  # Reassign index

    print(df)
if __name__ == "__main__":
    main()
selenium.py
    options = Options()#Selenium option settings
    options.add_argument('--headless')#Don't open the window (run Chrome headless)
    browser = webdriver.Chrome(executable_path='/Users/anatanonamae/Desktop/Tool/chromedriver', chrome_options=options)#Call driver and set options
    browser.implicitly_wait(3)#Wait time setting (seconds) applied to element lookups
access.py
    #Access the first page
    PAGE = 1
    InitURL= "https://teratail.com/search?tab=active&page=" + str(PAGE) + "&q=is%3Anot-answered"
    browser.get(InitURL)#Access with get
    print("I accessed the first page")#You can also check the current page with browser.current_url
find_elements_by_tag_name
--Select the links whose href contains "tags". WebDriverWait(browser, 2).until(EC.presence_of_all_elements_located) is used like sleep but is more powerful: it tells the driver to wait until the page has been loaded properly.
For more information: https://qiita.com/uguisuheiankyo/items/cec03891a86dfda12c9a
loop.py
    #Information gathering on each page
    TAG_DIC={}
    while True:
        A_TAG = browser.find_elements_by_tag_name("a")#collect a tag
        
        taglist=[]
        for TAG in A_TAG :
            HREF = TAG.get_attribute('href') #Collect href
              
            if "tags" in str(HREF):#Collect hrefs containing tags
                if not TAG.text:#Skip if blank
                    continue                        
                else:
                     taglist.append(TAG.text)
        for tag in taglist:
            if tag in TAG_DIC:
                 TAG_DIC[tag] += 1#Add if the tag exists
            else:
                TAG_DIC[tag] = 1#If not, register a new one and set the initial value to 1.
            
        NEXT_XPATH = browser.find_elements_by_xpath("//*[@id=\"mainContainer\"]/div[4]/div/p/a/span[contains(text(),\'Following page\')]")#Search for the element containing "Following page"
        if NEXT_XPATH:#Add PAGE if there is next
            PAGE += 1
        else:
            print("Got tags at last page.")#If not, it's over
            break
        URL= "https://teratail.com/search?tab=active&page=" + str(PAGE) + "&q=is%3Anot-answered"
        browser.get(URL)#Go to next page
        WebDriverWait(browser, 2).until(EC.presence_of_all_elements_located)#Wait until the page's elements are present
        print(browser.current_url)
break.py
        if browser.title == "Page Not Found":#The error page carries this title
            print("Got tags at last page.")#If an error occurs on the next page, the process ends.
            break
You can transpose with the .T in pd.DataFrame([TAG_DIC.keys(), TAG_DIC.values()], index=None).T — very convenient. pandas.py
    #Post-processing: Creating a Dataframe
    df = pd.DataFrame([TAG_DIC.keys(),TAG_DIC.values()],index=None).T#Convert to Dataframe; .T transposes the two rows (tags, counts) into two columns
    df.rename(columns={0:"Tag",1:"Count"},inplace =True)#Rename column
    df.sort_values(by=['Count'],ascending=False,inplace =True)#Sort in descending order
    df.reset_index(drop=True,inplace=True)#Reassign index
        
    print(df)
With that feeling, the result is ...
result.py
Got tags at last page.
                        Tag Count
0                       PHP  3139
1                    Python  2623
2                JavaScript  2428
3                      Ruby  1974
4                Python 3.x  1762
5                 WordPress  1563
・
・
[1369 rows x 2 columns]
・
・
A whopping 1369 rows came out. The cause was that the scrape picked up a fair amount of junk, such as "Tag list 501" and many entries with a count of 1. It would have been fine to just delete rows with a count of 100 or less in post-processing. To make it cleaner, you could add a conditional branch so that excluded words are never registered in the dictionary in the first place.
That's all for this time.
Recommended Posts