Monthly Archives: March 2011

Firecleaner: a python3 script to remove duplicate Firefox bookmark entries

If you are like me, using multiple internet browsers such as IE8/9, Firefox 3/4 and Google Chrome on both Windows XP/7 and Ubuntu, you are going to find Xmarks indispensable. The only pain is that bookmarks may be duplicated many times.
I finally could not bear it and came up with this solution in my favorite scripting language, Python:
1. Export bookmarks from Firefox.
2. Run the script below on the exported file to get an out.html file.
3. Delete all entries in the Firefox bookmark menu (NOT the bookmark toolbar).
4. Import the out.html file.
Running it on my bookmarks, I get:

"Of 5142 entries scanned, 4014 are duplicated and removed"

Nice.
 
# -*- coding: utf-8 -*-

class fireclean:

    """
        Remove duplicated entries from a bookmark file exported by Firefox.

        Lines that start with '<DT><A' (bookmark items) are de-duplicated by
        their second space-separated token (the HREF attribute when HREF is
        the first attribute of the tag); lines that start with '<DT><H3'
        (folders) are de-duplicated by the text between the last two '>'
        characters of the line.  The first occurrence is kept, later ones are
        moved to the log file.  All other lines are copied through unchanged.
    """

    def __init__(self, fin="bookmarks.html", fout="out.html", flog="leaveout.html"):

        """
            fin:  path to the original bookmark file exported from firefox
            fout: path to the generated file
            flog: path to the file containing removed lines

            All three files are opened here (UTF-8) and closed by clean().
        """

        self.fin = open(fin, encoding='utf-8')
        self.fout = open(fout, mode='w', encoding='utf-8')
        self.flog = open(flog, mode='w', encoding='utf-8')

        self.href_list = []      # keys of bookmark items already written
        self.header_list = []    # keys of folder headers already written

        self.total_lines = 0     # number of <DT> lines scanned
        self.removed_lines = 0   # number of <DT> lines removed as duplicates

    def remove_u200b(self, s):

        """
            for some reason, U+200B is found in lines, probably inserted by XMarks
            it needs to be removed before checking duplicates

            returns a copy of s without any U+200B characters
        """

        return s.replace('\u200b', '')

    def check_line(self, line, test, container):

        """
            parameters:
                line: the raw line to be checked
                test: substring of line to be tested
                container: container of tested strings

            algorithm:
                if test is found in container,
                then line is removed and written to flog
                otherwise test is recorded and line is written to fout

            returns:
                True if line was a duplicate (written to flog), False otherwise
        """

        if test in container:
            self.flog.write(line)
            return True

        container.append(test)
        self.fout.write(line)
        return False

    def clean(self, need_log=False):

        """
            parameters:
                need_log: print the number of scanned and removed entries if True

            algorithm:
                check duplicate HREFs for entries shown as items in FireFox bookmark manager
                check duplicate element names for entries shown as folders in FireFox bookmark manager

            The input, output and log files are always closed when this method
            returns or raises, so it can be called only once per instance.
        """

        try:
            for line in self.fin:
                # Compare on a normalised copy, but always write the raw line.
                stripped = self.remove_u200b(line).strip()

                if stripped.startswith('<DT><A'):       # bookmark item
                    key = stripped.split(' ', 2)[1]
                    seen = self.href_list
                elif stripped.startswith('<DT><H3'):    # bookmark folder
                    key = stripped.rsplit('>', 2)[-2]
                    seen = self.header_list
                else:
                    # Structural / unrelated line: copy through, do not count.
                    self.fout.write(line)
                    continue

                self.total_lines += 1
                self.removed_lines += self.check_line(line, key, seen)
        finally:
            # Close every handle, including the input file, even on error.
            self.fin.close()
            self.fout.close()
            self.flog.close()

        if need_log:
            print("Of {0} entries scanned, {1} are duplicated and removed".format(self.total_lines, self.removed_lines ))
    
import sys
if __name__ == "__main__":
    # Use the bookmark file named on the command line when one is given,
    # otherwise fall back to the class's default input path.
    cleaner = fireclean(sys.argv[1]) if len(sys.argv) > 1 else fireclean()
    cleaner.clean(True)

=-=-=-=-=

Powered by Blogilo

Advertisements