2017-09-19 13:42:09 +03:00
#!/usr/bin/env python2
2005-12-09 17:41:48 +03:00
#
# imports the API description and fills up a database with
# name relevance to modules, functions or web pages
#
# Operation needed:
# =================
#
# install mysqld, the python wrappers for mysql and libxml2, start mysqld
# - mysql-server
# - mysql
# - php-mysql
# - MySQL-python
# Change the root passwd of mysql:
# mysqladmin -u root password new_password
# Create the new database libvir
# mysqladmin -p create libvir
2008-05-15 10:12:32 +04:00
# Create a database user 'veillard' and give him password access
2005-12-09 17:41:48 +03:00
# change veillard and abcde with the right user name and passwd
# mysql -p
# password:
# mysql> GRANT ALL PRIVILEGES ON libvir TO veillard@localhost
# IDENTIFIED BY 'abcde' WITH GRANT OPTION;
# mysql> GRANT ALL PRIVILEGES ON libvir.* TO veillard@localhost
# IDENTIFIED BY 'abcde' WITH GRANT OPTION;
#
# As the user check the access:
# mysql -p libvir
# Enter password:
# Welcome to the MySQL monitor....
# mysql> use libvir
# Database changed
# mysql> quit
# Bye
#
# Then run the script in the doc subdir, it will create the symbols and
# word tables and populate them with information extracted from
# the libvirt-api.xml API description, and make them accessible read-only
# by nobody@localhost, the user expected to be Apache's one
#
# On the Apache configuration, make sure you have php support enabled
#
import MySQLdb
import libxml2
import sys
import string
import os
#
# We are not interested in parsing errors here
#
def callback(ctx, str):
    """Ignore an error report.

    Args:
        ctx: context value supplied at registration time (unused).
        str: the error message (unused).
    """
    return None
# Install callback as the libxml2 error handler, with None as its context.
libxml2 . registerErrorHandler ( callback , None )
#
2008-05-15 10:12:32 +04:00
# The dictionary of tables required and the SQL command needed
2005-12-09 17:41:48 +03:00
# to create them
#
2018-03-20 09:48:47 +03:00
# Maps the name of every required table to the SQL statement creating it.
# The keys must be the exact table names: checkTables() compares them with
# the names reported by "show tables".
TABLES = {
    "symbols": """ CREATE TABLE symbols (
           name varchar ( 255 ) BINARY NOT NULL ,
           module varchar ( 255 ) BINARY NOT NULL ,
           type varchar ( 25 ) NOT NULL ,
           descr varchar ( 255 ) ,
           UNIQUE KEY name ( name ) ,
           KEY module ( module ) ) """,
    "words": """ CREATE TABLE words (
           name varchar ( 50 ) BINARY NOT NULL ,
           symbol varchar ( 255 ) BINARY NOT NULL ,
           relevance int ,
           KEY name ( name ) ,
           KEY symbol ( symbol ) ,
           UNIQUE KEY ID ( name , symbol ) ) """,
    "wordsHTML": """ CREATE TABLE wordsHTML (
           name varchar ( 50 ) BINARY NOT NULL ,
           resource varchar ( 255 ) BINARY NOT NULL ,
           section varchar ( 255 ) ,
           id varchar ( 50 ) ,
           relevance int ,
           KEY name ( name ) ,
           KEY resource ( resource ) ,
           UNIQUE KEY ref ( name , resource ) ) """,
    "wordsArchive": """ CREATE TABLE wordsArchive (
           name varchar ( 50 ) BINARY NOT NULL ,
           ID int ( 11 ) NOT NULL ,
           relevance int ,
           KEY name ( name ) ,
           UNIQUE KEY ref ( name , ID ) ) """,
    "pages": """ CREATE TABLE pages (
           resource varchar ( 255 ) BINARY NOT NULL ,
           title varchar ( 255 ) BINARY NOT NULL ,
           UNIQUE KEY name ( resource ) ) """,
    "archives": """ CREATE TABLE archives (
           ID int ( 11 ) NOT NULL auto_increment ,
           resource varchar ( 255 ) BINARY NOT NULL ,
           title varchar ( 255 ) BINARY NOT NULL ,
           UNIQUE KEY id ( ID , resource ( 255 ) ) ,
           INDEX ( ID ) ,
           INDEX ( resource ) ) """,
    "Queries": """ CREATE TABLE Queries (
           ID int ( 11 ) NOT NULL auto_increment ,
           Value varchar ( 50 ) NOT NULL ,
           Count int ( 11 ) NOT NULL ,
           UNIQUE KEY id ( ID , Value ( 35 ) ) ,
           INDEX ( ID ) ) """,
    "AllQueries": """ CREATE TABLE AllQueries (
           ID int ( 11 ) NOT NULL auto_increment ,
           Value varchar ( 50 ) NOT NULL ,
           Count int ( 11 ) NOT NULL ,
           UNIQUE KEY id ( ID , Value ( 35 ) ) ,
           INDEX ( ID ) ) """,
}
#
# The XML API description file to parse
#
2018-03-20 09:48:47 +03:00
# Name of the XML API description file to parse (no surrounding blanks, so
# that it can be used directly as a path).
API = "libvirt-api.xml"
# Database connection shared by the helpers below; set by openMySQL().
DB = None
2005-12-09 17:41:48 +03:00
#########################################################################
2011-02-16 18:57:50 +03:00
# #
# MySQL database interfaces #
# #
2005-12-09 17:41:48 +03:00
#########################################################################
def createTable(db, name):
    """Drop and recreate one of the tables described in TABLES.

    Args:
        db: open database connection providing ``cursor()``.
        name: key of the table in TABLES.

    Returns:
        The value returned by executing the CREATE statement, or -1 when
        *db* or *name* is None, *name* is not a known table, or the
        creation fails.
    """
    if db is None or name is None:
        return -1
    # Check the name before touching the database: dropping a table that
    # cannot be recreated would only lose data.
    if name not in TABLES:
        print(" Failed to create table %s " % name)
        return -1
    c = db.cursor()
    # Table names cannot be bound as query parameters; name is known to be
    # one of the TABLES keys at this point.
    ret = c.execute(" DROP TABLE IF EXISTS %s " % name)
    if ret == 1:
        print(" Removed table %s " % name)
    print(" Creating table %s " % name)
    try:
        ret = c.execute(TABLES[name])
    except MySQLdb.Error:
        print(" Failed to create table %s " % name)
        return -1
    return ret
2018-03-20 09:48:54 +03:00
def checkTables(db, verbose=1):
    """Make sure every table listed in TABLES exists and is readable.

    Missing tables are created with createTable(); a table whose row count
    cannot be read is repaired and counted again.  Finally read access is
    granted to nobody@localhost (best effort).

    Args:
        db: open database connection providing ``cursor()``.
        verbose: when true, print progress and per-table row counts.

    Returns:
        0 on completion, -1 when *db* is None.
    """
    if db is None:
        return -1
    c = db.cursor()
    nbtables = c.execute(" show tables ")
    if verbose:
        print(" Found %d tables " % nbtables)
    existing = set(row[0] for row in c.fetchall())

    for table in TABLES:
        if table not in existing:
            print(" table %s missing " % table)
            createTable(db, table)
        count_query = " SELECT count(*) from %s " % table
        report = verbose
        try:
            c.execute(count_query)
        except MySQLdb.Error:
            print(" Troubles with table %s : repairing " % table)
            ret = c.execute(" repair table %s " % table)
            print(" repairing returned %d " % ret)
            c.execute(count_query)
            # The count is always shown after a repair.
            report = True
        row = c.fetchone()
        if report:
            print(" Table %s contains %d records " % (table, row[0]))
    if verbose:
        print(" checkTables finished ")

    # make sure apache can access the tables read-only
    try:
        c.execute(" GRANT SELECT ON libvir.* TO nobody@localhost ")
        c.execute(" GRANT INSERT,SELECT,UPDATE ON libvir.Queries TO nobody@localhost ")
    except MySQLdb.Error:
        # Granting is best effort, but do not hide the failure.
        print("Failed to grant access to nobody@localhost: %s %s" %
              sys.exc_info()[:2])
    return 0
2008-02-05 22:27:37 +03:00
2018-03-20 09:48:54 +03:00
def openMySQL(db="libvir", passwd=None, verbose=1):
    """Connect to the MySQL database and check its tables.

    The connection is stored in the module-level DB variable.

    Args:
        db: name of the database to open.
        passwd: database password; when None it is read from the
            MySQL_PASS environment variable and the script exits with
            status 1 if that variable is not set.
        verbose: passed on to checkTables().

    Returns:
        The result of checkTables(), or -1 if no connection was obtained.
    """
    global DB
    if passwd is None:
        try:
            passwd = os.environ["MySQL_PASS"]
        except KeyError:
            print(" No password available, set environment MySQL_PASS ")
            sys.exit(1)
    DB = MySQLdb.connect(passwd=passwd, db=db)
    if DB is None:
        return -1
    return checkTables(DB, verbose)
def updateWord(name, symbol, relevance):
    """Record the relevance of a word for a symbol in the words table.

    The row is inserted; if the insertion fails (typically because the
    (name, symbol) pair already exists) the existing row is updated.

    Args:
        name: the word.
        symbol: the symbol the word relates to.
        relevance: integer relevance score.

    Returns:
        The value returned by the executed statement, or -1 when there is
        no database, an argument is None, or both statements fail.
    """
    if DB is None:
        openMySQL()
    if DB is None:
        return -1
    if name is None or symbol is None:
        return -1

    c = DB.cursor()
    # Values are bound by the driver instead of being pasted into the SQL
    # text, so quotes in the data cannot break or alter the statement.
    try:
        ret = c.execute(
            "INSERT INTO words (name, symbol, relevance) VALUES (%s, %s, %s)",
            (name, symbol, relevance))
    except MySQLdb.Error:
        try:
            ret = c.execute(
                "UPDATE words SET relevance = %s where name = %s and symbol = %s",
                (relevance, name, symbol))
        except MySQLdb.Error:
            print(" Update word ( %s , %s , %s ) failed command " % (name, symbol, relevance))
            print(" UPDATE words SET relevance = %d where name = ' %s ' and symbol = ' %s ' " % (relevance, name, symbol))
            print("%s %s" % sys.exc_info()[:2])
            return -1

    return ret
def updateSymbol(name, module, type, desc):
    """Insert or update a symbol in the symbols table.

    The symbol name is also registered as a word for itself with a
    relevance of 50.  Only the first sentence of *desc*, limited to 99
    characters and with single quotes blanked, is stored.

    Args:
        name: symbol name.
        module: module the symbol belongs to.
        type: kind of symbol ('function', 'macro', ...).
        desc: free-text description; anything that is not a string is
            stored as a blank.

    Returns:
        The value returned by the executed statement, or -1 when there is
        no database, a mandatory argument is None, or both statements fail.
    """
    updateWord(name, name, 50)
    if DB is None:
        openMySQL()
    if DB is None:
        return -1
    if name is None or module is None or type is None:
        return -1

    try:
        # Quotes are still blanked so stored descriptions keep the same
        # shape as before, even though the query is now parameterized.
        desc = desc.replace("'", " ").split(".")[0][0:99]
    except AttributeError:
        desc = " "

    c = DB.cursor()
    try:
        ret = c.execute(
            "INSERT INTO symbols (name, module, type, descr) VALUES (%s, %s, %s, %s)",
            (name, module, type, desc))
    except MySQLdb.Error:
        try:
            ret = c.execute(
                "UPDATE symbols SET module=%s, type=%s, descr=%s where name=%s",
                (module, type, desc, name))
        except MySQLdb.Error:
            print(" Update symbol ( %s , %s , %s ) failed command " % (name, module, type))
            print(""" UPDATE symbols SET module= ' %s ' , type= ' %s ' , descr= ' %s ' where name= ' %s ' """ % (module, type, desc, name))
            print("%s %s" % sys.exc_info()[:2])
            return -1

    return ret
2008-02-05 22:27:37 +03:00
2018-03-20 09:48:54 +03:00
def addFunction(name, module, desc=" "):
    """Register *name* from *module* as a symbol of type 'function'.

    Returns the result of updateSymbol().
    """
    return updateSymbol(name, module, 'function', desc)
2018-03-20 09:48:54 +03:00
def addMacro(name, module, desc=" "):
    """Register *name* from *module* as a symbol of type 'macro'.

    Returns the result of updateSymbol().
    """
    return updateSymbol(name, module, 'macro', desc)
2018-03-20 09:48:54 +03:00
def addEnum(name, module, desc=" "):
    """Register *name* from *module* as a symbol of type 'enum'.

    Returns the result of updateSymbol().
    """
    return updateSymbol(name, module, 'enum', desc)
2018-03-20 09:48:54 +03:00
def addStruct(name, module, desc=" "):
    """Register *name* from *module* as a symbol of type 'struct'.

    Returns the result of updateSymbol().
    """
    return updateSymbol(name, module, 'struct', desc)
2018-03-20 09:48:54 +03:00
def addConst(name, module, desc=" "):
    """Register *name* from *module* as a symbol of type 'const'.

    Returns the result of updateSymbol().
    """
    return updateSymbol(name, module, 'const', desc)
2018-03-20 09:48:54 +03:00
def addType(name, module, desc=" "):
    """Register *name* from *module* as a symbol of type 'type'.

    Returns the result of updateSymbol().
    """
    return updateSymbol(name, module, 'type', desc)
2018-03-20 09:48:54 +03:00
def addFunctype(name, module, desc=" "):
    """Register *name* from *module* as a symbol of type 'functype'.

    Returns the result of updateSymbol().
    """
    return updateSymbol(name, module, 'functype', desc)
def addPage(resource, title):
    """Insert a page in the pages table, or update its title.

    Args:
        resource: the page resource, unique key of the table.
        title: the page title.

    Returns:
        The value returned by the executed statement, or -1 when there is
        no database, *resource* is None, or both statements fail.
    """
    if DB is None:
        openMySQL()
    if DB is None:
        return -1
    if resource is None:
        return -1

    c = DB.cursor()
    try:
        ret = c.execute(
            "INSERT INTO pages (resource, title) VALUES (%s, %s)",
            (resource, title))
    except MySQLdb.Error:
        try:
            ret = c.execute(
                "UPDATE pages SET title=%s WHERE resource=%s",
                (title, resource))
        except MySQLdb.Error:
            # Report the values this function actually works with; the
            # previous message used names that do not exist here.
            print(" Update page ( %s , %s ) failed command " % (resource, title))
            print(""" UPDATE pages SET title= ' %s ' WHERE resource= ' %s ' """ % (title, resource))
            print("%s %s" % sys.exc_info()[:2])
            return -1

    return ret
def updateWordHTML(name, resource, desc, id, relevance):
    """Record the relevance of a word for a web page in wordsHTML.

    The row is inserted; if that fails the existing (name, resource) row
    is updated instead.

    Args:
        name: the word.
        resource: the page the word was found in.
        desc: section description; single quotes are blanked and the text
            is limited to 99 characters.  None is stored as a blank.
        id: anchor identifier inside the page; None is stored as a blank.
        relevance: integer relevance score.

    Returns:
        The value returned by the executed statement, or -1 when there is
        no database, *name* or *resource* is None, or both statements fail.
    """
    if DB is None:
        openMySQL()
    if DB is None:
        return -1
    if name is None or resource is None:
        return -1
    if id is None:
        id = " "
    if desc is None:
        desc = " "
    else:
        try:
            desc = desc.replace("'", " ")[0:99]
        except (AttributeError, TypeError):
            desc = " "

    c = DB.cursor()
    try:
        ret = c.execute(
            "INSERT INTO wordsHTML (name, resource, section, id, relevance) "
            "VALUES (%s, %s, %s, %s, %s)",
            (name, resource, desc, id, relevance))
    except MySQLdb.Error:
        try:
            ret = c.execute(
                "UPDATE wordsHTML SET section=%s, id=%s, relevance=%s "
                "where name=%s and resource=%s",
                (desc, id, relevance, name, resource))
        except MySQLdb.Error:
            print(" Update symbol ( %s , %s , %d ) failed command " % (name, resource, relevance))
            print(""" UPDATE wordsHTML SET section= ' %s ' , id= ' %s ' , relevance= ' %d ' where name= ' %s ' and resource= ' %s ' """ % (desc, id, relevance, name, resource))
            print("%s %s" % sys.exc_info()[:2])
            return -1

    return ret
def checkXMLMsgArchive(url):
    """Look up the ID of an archived message by its URL.

    Args:
        url: the resource of the archive entry.

    Returns:
        The ID column of the matching archives row, or -1 when there is no
        database, *url* is None, the query fails or no row matches.
    """
    if DB is None:
        openMySQL()
    if DB is None:
        return -1
    if url is None:
        return -1

    c = DB.cursor()
    try:
        c.execute("SELECT ID FROM archives WHERE resource=%s", (url,))
        row = c.fetchone()
    except MySQLdb.Error:
        return -1
    if row is None:
        return -1

    return row[0]
2008-02-05 22:27:37 +03:00
2005-12-09 17:41:48 +03:00
def addXMLMsgArchive(url, title):
    """Add a message to the archives table and return its ID.

    Args:
        url: the resource of the archived message.
        title: its title; single quotes are blanked and the text is
            limited to 99 characters.  None is stored as a blank.

    Returns:
        The integer ID of the new row, or -1 when there is no database,
        *url* is None, a statement fails or the ID cannot be read back.
    """
    if DB is None:
        openMySQL()
    if DB is None:
        return -1
    if url is None:
        return -1
    if title is None:
        title = " "
    else:
        title = title.replace("'", " ")[0:99]

    c = DB.cursor()
    # cmd always holds the statement being run so a failure can name it.
    cmd = "INSERT INTO archives (resource, title) VALUES (%s, %s)"
    try:
        c.execute(cmd, (url, title))
        cmd = "SELECT ID FROM archives WHERE resource=%s"
        c.execute(cmd, (url,))
        row = c.fetchone()
    except MySQLdb.Error:
        print(" addXMLMsgArchive failed command: %s " % cmd)
        return -1
    if row is None:
        print(" addXMLMsgArchive failed to get the ID: %s " % url)
        return -1

    return int(row[0])
def updateWordArchive(name, id, relevance):
    """Record the relevance of a word for an archived message.

    The row is inserted in wordsArchive; if that fails the existing
    (name, ID) row is updated instead.

    Args:
        name: the word.
        id: ID of the archived message.
        relevance: integer relevance score.

    Returns:
        The value returned by the executed statement, or -1 when there is
        no database, an argument is None, or both statements fail.
    """
    if DB is None:
        openMySQL()
    if DB is None:
        return -1
    if name is None or id is None:
        return -1

    c = DB.cursor()
    try:
        ret = c.execute(
            "INSERT INTO wordsArchive (name, id, relevance) VALUES (%s, %s, %s)",
            (name, id, relevance))
    except MySQLdb.Error:
        try:
            ret = c.execute(
                "UPDATE wordsArchive SET relevance=%s where name=%s and ID=%s",
                (relevance, name, id))
        except MySQLdb.Error:
            print(" Update word archive ( %s , %d , %d ) failed command " % (name, id, relevance))
            print(""" UPDATE wordsArchive SET relevance= ' %d ' where name= ' %s ' and ID= ' %d ' """ % (relevance, name, id))
            print("%s %s" % sys.exc_info()[:2])
            return -1

    return ret
#########################################################################
2011-02-16 18:57:50 +03:00
# #
# Word dictionary and analysis routines #
# #
2005-12-09 17:41:48 +03:00
#########################################################################
#
# the 100 most common English words, minus those shorter than 3 letters,
# plus our own set
#
dropWords = {
' the ' : 0 , ' this ' : 0 , ' can ' : 0 , ' man ' : 0 , ' had ' : 0 , ' him ' : 0 , ' only ' : 0 ,
' and ' : 0 , ' not ' : 0 , ' been ' : 0 , ' other ' : 0 , ' even ' : 0 , ' are ' : 0 , ' was ' : 0 ,
' new ' : 0 , ' most ' : 0 , ' but ' : 0 , ' when ' : 0 , ' some ' : 0 , ' made ' : 0 , ' from ' : 0 ,
' who ' : 0 , ' could ' : 0 , ' after ' : 0 , ' that ' : 0 , ' will ' : 0 , ' time ' : 0 , ' also ' : 0 ,
' have ' : 0 , ' more ' : 0 , ' these ' : 0 , ' did ' : 0 , ' was ' : 0 , ' two ' : 0 , ' many ' : 0 ,
' they ' : 0 , ' may ' : 0 , ' before ' : 0 , ' for ' : 0 , ' which ' : 0 , ' out ' : 0 , ' then ' : 0 ,
' must ' : 0 , ' one ' : 0 , ' through ' : 0 , ' with ' : 0 , ' you ' : 0 , ' said ' : 0 ,
' first ' : 0 , ' back ' : 0 , ' were ' : 0 , ' what ' : 0 , ' any ' : 0 , ' years ' : 0 , ' his ' : 0 ,
' her ' : 0 , ' where ' : 0 , ' all ' : 0 , ' its ' : 0 , ' now ' : 0 , ' much ' : 0 , ' she ' : 0 ,
' about ' : 0 , ' such ' : 0 , ' your ' : 0 , ' there ' : 0 , ' into ' : 0 , ' like ' : 0 , ' may ' : 0 ,
' would ' : 0 , ' than ' : 0 , ' our ' : 0 , ' well ' : 0 , ' their ' : 0 , ' them ' : 0 , ' over ' : 0 ,
' down ' : 0 ,
' net ' : 0 , ' www ' : 0 , ' bad ' : 0 , ' Okay ' : 0 , ' bin ' : 0 , ' cur ' : 0 ,
}
wordsDict = { }
wordsDictHTML = { }
wordsDictArchive = { }
def cleanupWordsString(str):
    """Replace punctuation and line breaks in *str* by blanks.

    Each separator character is replaced wherever it occurs, including
    when it is glued to a word, so that a later whitespace split yields
    bare words.

    Args:
        str: the text to clean.

    Returns:
        The text with every separator character turned into a blank.
    """
    separators = ".!?,'\";(){}<>=/*:#\\\n\r\xc2\xa0"
    text = str
    for char in separators:
        text = text.replace(char, " ")
    return text
2008-02-05 22:27:37 +03:00
2005-12-09 17:41:48 +03:00
def cleanupDescrString(str):
    """Normalise a description: blank out quotes and collapse whitespace.

    Single quotes, line breaks and the two bytes of a UTF-8 non-breaking
    space are replaced by blanks, then runs of whitespace are reduced to
    one blank and leading/trailing whitespace is dropped.

    Args:
        str: the description text.

    Returns:
        The cleaned-up description.
    """
    text = str
    for char in "'\n\r\xc2\xa0":
        text = text.replace(char, " ")
    # Join the split words, not the characters of the string itself.
    return " ".join(text.split())
def splitIdentifier(str):
    """Split a camelCase identifier into lower-case words.

    A word starts at an ASCII letter, continues over a run of upper-case
    letters and then a run of lower-case letters; digits directly after a
    word are skipped and any other character acts as a separator.

    Args:
        str: the identifier, e.g. "virDomainGetInfo".

    Returns:
        The list of lower-cased words, e.g. ['vir', 'domain', 'get', 'info'].
    """
    ret = []
    pos = 0
    end = len(str)
    while pos < end:
        first = str[pos].lower()
        pos += 1
        if first < 'a' or first > 'z':
            continue
        start = pos
        while pos < end and 'A' <= str[pos] <= 'Z':
            pos += 1
        while pos < end and 'a' <= str[pos] <= 'z':
            pos += 1
        word = first + str[start:pos].lower()
        # Trailing digits belong to no word.
        while pos < end and '0' <= str[pos] <= '9':
            pos += 1
        ret.append(word)
    return ret
def addWord(word, module, symbol, relevance):
    """Accumulate the relevance of *word* for (module, symbol) in wordsDict.

    Words in dropWords or starting with a character above 0x80 are
    ignored.  When a word is already linked to more than 500 entries it is
    disabled (its entry is set to None) and ignored from then on.

    Args:
        word: the word to index; must have at least 3 characters.
        module: module the symbol belongs to.
        symbol: the symbol the word relates to.
        relevance: score to add for this occurrence.

    Returns:
        The accumulated relevance, 0 when the word is ignored, or -1 on
        invalid arguments.
    """
    if word is None or len(word) < 3:
        return -1
    if module is None or symbol is None:
        return -1
    if word in dropWords:
        return 0
    if ord(word[0]) > 0x80:
        return 0

    if word in wordsDict:
        d = wordsDict[word]
        if d is None:
            return 0
        if len(d) > 500:
            wordsDict[word] = None
            return 0
        relevance = relevance + d.get((module, symbol), 0)
    else:
        d = wordsDict[word] = {}
    d[(module, symbol)] = relevance
    return relevance
2008-02-05 22:27:37 +03:00
2005-12-09 17:41:48 +03:00
def addString(str, module, symbol, relevance):
    """Index every word of a text for (module, symbol).

    The text is cleaned with cleanupWordsString(), split on whitespace and
    each word longer than 2 characters is passed to addWord() with the
    requested relevance.

    Args:
        str: the text to index; must have at least 3 characters.
        module: module the symbol belongs to.
        symbol: the symbol the text describes.
        relevance: score given to each word of the text.

    Returns:
        The sum of the values returned by addWord(), or -1 when *str* is
        None or too short.
    """
    if str is None or len(str) < 3:
        return -1
    ret = 0
    for word in cleanupWordsString(str).split():
        if len(word) > 2:
            # Honour the caller's relevance instead of a fixed value.
            ret = ret + addWord(word, module, symbol, relevance)
    return ret
def addWordHTML(word, resource, id, section, relevance):
    """Accumulate the relevance of *word* for a web page in wordsDictHTML.

    Words in dropWords or starting with a character above 0x80 are
    ignored.  When the word is already known for *resource*, the stored
    id and section are kept (unless they are None) and the relevance is
    added to the stored one.

    Args:
        word: the word to index; must have at least 3 characters.
        resource: the page the word was found in.
        id: anchor identifier inside the page.
        section: section title, cleaned with cleanupDescrString().
        relevance: score to add for this occurrence.

    Returns:
        The accumulated relevance, 0 when the word is ignored, or -1 on
        invalid arguments.
    """
    if word is None or len(word) < 3:
        return -1
    if resource is None or section is None:
        return -1
    if word in dropWords:
        return 0
    if ord(word[0]) > 0x80:
        return 0
    section = cleanupDescrString(section)

    if word in wordsDictHTML:
        d = wordsDictHTML[word]
        if d is None:
            print(" skipped %s " % word)
            return 0
        if resource in d:
            (r, i, s) = d[resource]
            if i is not None:
                id = i
            if s is not None:
                section = s
            relevance = relevance + r
    else:
        d = wordsDictHTML[word] = {}
    d[resource] = (relevance, id, section)
    return relevance
2008-02-05 22:27:37 +03:00
2005-12-09 17:41:48 +03:00
def addStringHTML(str, resource, id, section, relevance):
    """Index every word of a text found in a web page.

    The text is cleaned with cleanupWordsString(), split on whitespace and
    each word longer than 2 characters is passed to addWordHTML().  A
    failure on one word is reported and does not stop the others.

    Args:
        str: the text to index; must have at least 3 characters.
        resource: the page the text comes from.
        id: anchor identifier inside the page.
        section: section title.
        relevance: score given to each word.

    Returns:
        The sum of the values returned by addWordHTML(), or -1 when *str*
        is None or too short.
    """
    if str is None or len(str) < 3:
        return -1
    ret = 0
    for word in cleanupWordsString(str).split():
        if len(word) <= 2:
            continue
        try:
            r = addWordHTML(word, resource, id, section, relevance)
            if r < 0:
                print(" addWordHTML failed: %s %s " % (word, resource))
            ret = ret + r
        except Exception:
            print(" addWordHTML failed: %s %s %d " % (word, resource, relevance))
            print("%s %s" % sys.exc_info()[:2])
    return ret
def addWordArchive(word, id, relevance):
    """Accumulate the relevance of *word* for an archived message.

    Words in dropWords or starting with a character above 0x80 are
    ignored.

    Args:
        word: the word to index; must have at least 3 characters.
        id: ID of the archived message; None and -1 are rejected.
        relevance: score to add for this occurrence.

    Returns:
        The accumulated relevance, 0 when the word is ignored, or -1 on
        invalid arguments.
    """
    if word is None or len(word) < 3:
        return -1
    if id is None or id == -1:
        return -1
    if word in dropWords:
        return 0
    if ord(word[0]) > 0x80:
        return 0

    if word in wordsDictArchive:
        d = wordsDictArchive[word]
        if d is None:
            print(" skipped %s " % word)
            return 0
        relevance = relevance + d.get(id, 0)
    else:
        d = wordsDictArchive[word] = {}
    d[id] = relevance
    return relevance
2008-02-05 22:27:37 +03:00
2005-12-09 17:41:48 +03:00
def addStringArchive(str, id, relevance):
    """Index every word of a text found in an archived message.

    The text is cleaned with cleanupWordsString(), split on whitespace and
    each word longer than 2 characters is passed to addWordArchive().  A
    failure on one word is reported and does not stop the others.

    Args:
        str: the text to index; must have at least 3 characters.
        id: ID of the archived message.
        relevance: score given to each word.

    Returns:
        The sum of the non-negative values returned by addWordArchive(),
        or -1 when *str* is None or too short.
    """
    if str is None or len(str) < 3:
        return -1
    ret = 0
    for word in cleanupWordsString(str).split():
        if len(word) <= 2:
            continue
        try:
            r = addWordArchive(word, id, relevance)
            if r < 0:
                print(" addWordArchive failed: %s %s " % (word, id))
            else:
                ret = ret + r
        except Exception:
            print(" addWordArchive failed: %s %s %d " % (word, id, relevance))
            print("%s %s" % sys.exc_info()[:2])
    return ret
#########################################################################
2011-02-16 18:57:50 +03:00
# #
# XML API description analysis #
# #
2005-12-09 17:41:48 +03:00
#########################################################################
def loadAPI(filename):
    """Parse the XML file *filename* with libxml2 and return the document.

    A confirmation line is printed once the file has been parsed.
    """
    parsed = libxml2.parseFile(filename)
    print(" loaded %s " % filename)
    return parsed
def foundExport(file, symbol):
    """Register an exported symbol as a function of *file*.

    The symbol is stored with addFunction() and each word obtained from
    splitIdentifier() is indexed with a relevance of 10.

    Returns:
        1 when the symbol was registered, 0 when *file* or *symbol* is
        None.
    """
    if file is None or symbol is None:
        return 0
    addFunction(symbol, file)
    for piece in splitIdentifier(symbol):
        addWord(piece, file, symbol, 10)
    return 1
2008-02-05 22:27:37 +03:00
2005-12-09 17:41:48 +03:00
def analyzeAPIFile(top):
    """Process the <exports> children of one <file> element.

    Args:
        top: the <file> node; its "name" attribute is the module name.

    Returns:
        The number of exported symbols registered by foundExport().
    """
    count = 0
    name = top.prop("name")
    cur = top.children
    while cur is not None:
        if cur.type != 'text':
            if cur.name == "exports":
                count = count + foundExport(name, cur.prop("symbol"))
            else:
                # Both placeholders need a value: the unexpected element
                # and the file it was found in.
                print(" unexpected element %s in API doc <file name= ' %s ' > " % (cur.name, name))
        cur = cur.next
    return count
def analyzeAPIFiles(top):
    """Process every <file> child of the <files> element.

    Args:
        top: the <files> node.

    Returns:
        The sum of the counts returned by analyzeAPIFile().
    """
    count = 0
    cur = top.children
    while cur is not None:
        if cur.type != 'text':
            if cur.name == "file":
                count = count + analyzeAPIFile(cur)
            else:
                print(" unexpected element %s in API doc <files> " % cur.name)
        cur = cur.next
    return count
def analyzeAPIEnum(top):
    """Register the enum described by the element *top*.

    The "file" and "name" attributes give the module and the symbol; the
    words of the symbol name are indexed with a relevance of 10.

    Returns:
        1 when the enum was registered, 0 when an attribute is missing.
    """
    file = top.prop("file")
    if file is None:
        return 0
    symbol = top.prop("name")
    if symbol is None:
        return 0
    addEnum(symbol, file)
    for word in splitIdentifier(symbol):
        addWord(word, file, symbol, 10)
    return 1
def analyzeAPIConst(top):
    """Register the constant described by the element *top*.

    The "file" and "name" attributes give the module and the symbol; the
    words of the symbol name are indexed with a relevance of 10.

    Returns:
        1 when the constant was registered, 0 when an attribute is missing.
    """
    file = top.prop("file")
    if file is None:
        return 0
    symbol = top.prop("name")
    if symbol is None:
        return 0
    addConst(symbol, file)
    for word in splitIdentifier(symbol):
        addWord(word, file, symbol, 10)
    return 1
def analyzeAPIType(top):
    """Register the type described by the element *top*.

    The "file" and "name" attributes give the module and the symbol; the
    words of the symbol name are indexed with a relevance of 10.

    Returns:
        1 when the type was registered, 0 when an attribute is missing.
    """
    file = top.prop("file")
    if file is None:
        return 0
    symbol = top.prop("name")
    if symbol is None:
        return 0
    addType(symbol, file)
    for word in splitIdentifier(symbol):
        addWord(word, file, symbol, 10)
    return 1
def analyzeAPIFunctype(top):
    """Register the function type described by the element *top*.

    The "file" and "name" attributes give the module and the symbol; the
    words of the symbol name are indexed with a relevance of 10.

    Returns:
        1 when the function type was registered, 0 when an attribute is
        missing.
    """
    file = top.prop("file")
    if file is None:
        return 0
    symbol = top.prop("name")
    if symbol is None:
        return 0
    addFunctype(symbol, file)
    for word in splitIdentifier(symbol):
        addWord(word, file, symbol, 10)
    return 1
def analyzeAPIStruct(top):
    """Register the struct described by the element *top*.

    The "file" and "name" attributes give the module and the symbol.  The
    words of the symbol name are indexed with a relevance of 10 and the
    words of the optional "info" attribute with a relevance of 5.

    Returns:
        1 when the struct was registered, 0 when "file" or "name" is
        missing.
    """
    file = top.prop("file")
    if file is None:
        return 0
    symbol = top.prop("name")
    if symbol is None:
        return 0

    addStruct(symbol, file)
    for word in splitIdentifier(symbol):
        addWord(word, file, symbol, 10)

    info = top.prop("info")
    if info is not None:
        info = info.replace("'", " ").strip()
        for word in info.split():
            if len(word) > 2:
                addWord(word, file, symbol, 5)
    return 1
def analyzeAPIMacro(top):
    """Register the macro described by the element *top*.

    The "file" and "name" attributes give the module and the symbol; the
    description is the content of the first <info> child.  The words of
    the symbol name are indexed with a relevance of 10 and the words of
    the description with a relevance of 5.

    Returns:
        1 when the macro was registered with a description, 0 when an
        attribute is missing or the macro has no <info> child (in which
        case it is still registered, without description).
    """
    file = top.prop("file")
    if file is None:
        return 0
    symbol = top.prop("name")
    if symbol is None:
        return 0
    symbol = symbol.replace("'", " ").strip()

    info = None
    cur = top.children
    while cur is not None:
        if cur.type != 'text' and cur.name == "info":
            info = cur.content
            break
        cur = cur.next

    for word in splitIdentifier(symbol):
        addWord(word, file, symbol, 10)

    if info is None:
        addMacro(symbol, file)
        print(" Macro %s description has no <info> " % symbol)
        return 0

    info = info.replace("'", " ").strip()
    addMacro(symbol, file, info)
    for word in info.split():
        if len(word) > 2:
            addWord(word, file, symbol, 5)
    return 1
def analyzeAPIFunction(top):
    """Register the function described by the element *top*.

    The "file" and "name" attributes give the module and the symbol.  The
    children are used as follows:
      - <info>: its content is the function description, indexed with a
        relevance of 5;
      - <return>: its "info" attribute is indexed with a relevance of 7;
      - <arg>: its "info" attribute is indexed with a relevance of 5 and
        its "name" attribute with a relevance of 7.
    The words of the symbol name are indexed with a relevance of 10.

    Returns:
        1 when the function was registered, 0 when "file" or "name" is
        missing.
    """
    file = top.prop("file")
    if file is None:
        return 0
    symbol = top.prop("name")
    if symbol is None:
        return 0
    symbol = symbol.replace("'", " ").strip()

    info = None
    cur = top.children
    while cur is not None:
        if cur.type != 'text':
            if cur.name == "info":
                info = cur.content
            elif cur.name == "return":
                rinfo = cur.prop("info")
                if rinfo is not None:
                    rinfo = rinfo.replace("'", " ").strip()
                    addString(rinfo, file, symbol, 7)
            elif cur.name == "arg":
                ainfo = cur.prop("info")
                if ainfo is not None:
                    ainfo = ainfo.replace("'", " ").strip()
                    addString(ainfo, file, symbol, 5)
                name = cur.prop("name")
                if name is not None:
                    name = name.replace("'", " ").strip()
                    addWord(name, file, symbol, 7)
        cur = cur.next

    if info is None:
        print(" Function %s description has no <info> " % symbol)
        addFunction(symbol, file, " ")
    else:
        info = info.replace("'", " ").strip()
        addFunction(symbol, file, info)
        addString(info, file, symbol, 5)

    for word in splitIdentifier(symbol):
        addWord(word, file, symbol, 10)
    return 1
def analyzeAPISymbols(top):
    """Dispatch every child element of the symbols node to its analyzer.

    Text children are skipped.  Each remaining child is handed to the
    analyzeAPI* function selected by its element name, and the values
    returned by those analyzers are summed.  A child with any other name
    is reported on stdout and contributes nothing.

    Returns the accumulated count.
    """
    def elements(first):
        # Yield the non-text siblings starting at first, in document order.
        node = first
        while node is not None:
            if node.type != ' text ':
                yield node
            node = node.next

    count = 0
    for node in elements(top.children):
        tag = node.name
        if tag == " macro ":
            count += analyzeAPIMacro(node)
        elif tag == " function ":
            count += analyzeAPIFunction(node)
        elif tag == " const ":
            count += analyzeAPIConst(node)
        elif tag == " typedef ":
            count += analyzeAPIType(node)
        elif tag == " struct ":
            count += analyzeAPIStruct(node)
        elif tag == " enum ":
            count += analyzeAPIEnum(node)
        elif tag == " functype ":
            count += analyzeAPIFunctype(node)
        else:
            print(" unexpected element %s in API doc <files> " % (tag))
    return count
def analyzeAPI(doc):
    """Walk the top level of the API document and index its symbols.

    Returns -1 when doc is None or when the root element does not carry
    the expected name; otherwise returns the sum of what
    analyzeAPISymbols() reports for each symbols child.  The files child
    is recognised but deliberately not analysed; any other element is
    reported on stdout.
    """
    if doc is None:
        return -1
    root = doc.getRootElement()
    if root.name != " api ":
        print(" Unexpected root name ")
        return -1

    total = 0
    node = root.children
    while node is not None:
        if node.type != ' text ':
            if node.name == " symbols ":
                total += analyzeAPISymbols(node)
            elif node.name != " files ":
                # The files element is accepted silently; anything else
                # is unexpected at this level.
                print(" unexpected element %s in API doc " % (node.name))
        node = node.next
    return total
#########################################################################
2011-02-16 18:57:50 +03:00
# #
# Web pages parsing and analysis #
# #
2005-12-09 17:41:48 +03:00
#########################################################################
import glob
def analyzeHTMLText(doc, resource, p, section, id):
    """Index the content of a text node p found in an HTML page.

    The content of p is passed to addStringHTML() together with resource,
    id and section (in that order) and the constant 5 (presumably the
    relevance weight -- confirm against addStringHTML).  doc is accepted
    for signature symmetry and is not used here.

    Returns what addStringHTML() reports added to 0, or -1 when reading
    the content or indexing it raises.
    """
    words = 0
    try:
        words += addStringHTML(p.content, resource, id, section, 5)
    except Exception:
        # Best effort: a failing node must not abort the page, but
        # KeyboardInterrupt/SystemExit are no longer swallowed.
        return -1
    return words
def analyzeHTMLPara(doc, resource, p, section, id):
    """Index the content of a paragraph node p found in an HTML page.

    The content of p is passed to addStringHTML() together with resource,
    id and section (in that order) and the constant 5 (presumably the
    relevance weight -- confirm against addStringHTML).  doc is not used.

    Returns what addStringHTML() reports added to 0, or -1 when reading
    the content or indexing it raises.
    """
    words = 0
    try:
        words += addStringHTML(p.content, resource, id, section, 5)
    except Exception:
        # Best effort per node; interpreter-exit exceptions pass through.
        return -1
    return words
def analyzeHTMLPre(doc, resource, p, section, id):
    """Index the content of a preformatted node p found in an HTML page.

    The content of p is passed to addStringHTML() together with resource,
    id and section (in that order) and the constant 5 (presumably the
    relevance weight -- confirm against addStringHTML).  doc is not used.

    Returns what addStringHTML() reports added to 0, or -1 when reading
    the content or indexing it raises.
    """
    words = 0
    try:
        words += addStringHTML(p.content, resource, id, section, 5)
    except Exception:
        # Best effort per node; interpreter-exit exceptions pass through.
        return -1
    return words
def analyzeHTML(doc, resource, p, section, id):
    """Index the content of node p found in an HTML page.

    NOTE(review): the module defines analyzeHTML(doc, resource) right
    after this function, which rebinds the name; this five-argument
    variant is therefore unreachable by name once the module is loaded.
    It is kept because removing it is outside the scope of this block.

    Returns what addStringHTML() reports added to 0, or -1 when reading
    the content or indexing it raises.
    """
    words = 0
    try:
        words += addStringHTML(p.content, resource, id, section, 5)
    except Exception:
        # Best effort per node; interpreter-exit exceptions pass through.
        return -1
    return words
def analyzeHTML(doc, resource):
    """Index one parsed HTML page.

    The page title is taken from the first //head/title match, falling
    back to a " Page <resource> " string, and the page is registered with
    addPage().  Headings (h1/h2/h3) and text nodes are then visited in
    the order returned by the XPath query: a heading updates the current
    section and, when it has an id or name property, the current anchor;
    every other node is handed to the matching analyzeHTML* helper.

    Problems while walking the nodes are reported on stdout and end the
    walk; the paragraphs counted so far are still returned.

    Returns the number of nodes handed to the helpers.
    """
    para = 0
    ctxt = doc.xpathNewContext()
    try:
        try:
            title = ctxt.xpathEval(" //head/title ")[0].content
        except Exception:
            title = " Page %s " % (resource)
        addPage(resource, title)
        try:
            items = ctxt.xpathEval(" //h1 | //h2 | //h3 | //text() ")
            section = title
            anchor = " "
            for item in items:
                if item.name in (' h1 ', ' h2 ', ' h3 '):
                    section = item.content
                    # Prefer the id property, fall back to name.
                    target = item.prop(" id ") or item.prop(" name ")
                    if target:
                        anchor = target
                elif item.type == ' text ':
                    analyzeHTMLText(doc, resource, item, section, anchor)
                    para += 1
                elif item.name == ' p ':
                    analyzeHTMLPara(doc, resource, item, section, anchor)
                    para += 1
                elif item.name == ' pre ':
                    analyzeHTMLPre(doc, resource, item, section, anchor)
                    para += 1
                else:
                    print(" Page %s , unexpected %s element "
                          % (resource, item.name))
        except Exception:
            print(" Page %s : problem analyzing " % (resource))
            # sys.exc_info() replaces the deprecated sys.exc_type and
            # sys.exc_value globals; the printed text is unchanged.
            print("%s %s" % sys.exc_info()[:2])
    finally:
        # The libxml2 bindings do not release XPath contexts on their
        # own; free it explicitly so each page does not leak one.
        ctxt.xpathFreeContext()
    return para
def _parseHTMLFile(path):
    # Parse path as XML first, then with the HTML parser; None if both fail.
    try:
        return libxml2.parseFile(path)
    except Exception:
        pass
    try:
        return libxml2.htmlParseFile(path, None)
    except Exception:
        return None


def analyzeHTMLPages():
    """Index every HTML page found by the glob patterns below.

    Files whose first three characters equal the API literal, and the
    xml.html file, are skipped.  Each remaining file is parsed (XML
    parser first, HTML parser as fallback) and handed to analyzeHTML().
    A file that cannot be parsed or analysed is reported on stdout and
    skipped instead of aborting the whole run, and every parsed document
    is freed once it has been processed.

    Returns the number of pages analysed successfully.
    """
    patterns = (" *.html ", " tutorial/*.html ", " CIM/*.html ",
                " ocaml/*.html ", " ruby/*.html ")
    HTMLfiles = []
    for pattern in patterns:
        HTMLfiles.extend(glob.glob(pattern))

    ret = 0
    for html in HTMLfiles:
        # NOTE(review): the literal as written is longer than the
        # 3-character slice it is compared with, so this test cannot
        # succeed -- confirm the intended prefix.
        if html[0:3] == " API ":
            continue
        if html == " xml.html ":
            continue
        doc = _parseHTMLFile(html)
        if doc is None:
            print(" could not parse %s " % (html))
            continue
        try:
            try:
                res = analyzeHTML(doc, html)
                print(" Parsed %s : %d paragraphs " % (html, res))
                ret += 1
            except Exception:
                print(" could not parse %s " % (html))
        finally:
            doc.freeDoc()
    return ret
#########################################################################
2011-02-16 18:57:50 +03:00
# #
# Mail archives parsing and analysis #
# #
2005-12-09 17:41:48 +03:00
#########################################################################
import time
2018-03-20 09:48:54 +03:00
def getXMLDateArchive(t=None):
    """Build the URL of the by-date index of the list archive.

    t is a timestamp in seconds since the epoch; when it is None the
    current time is used.  The timestamp is broken down in UTC and its
    year and formatted month are substituted into the archive URL
    template.

    Returns the resulting URL string.
    """
    # NOTE(review): the strftime format is passed exactly as written in
    # the file; confirm it yields the month name the archive expects.
    when = time.gmtime(time.time() if t is None else t)
    return " http://www.redhat.com/archives/libvir-list/ %d - %s /date.html " % (
        when[0], time.strftime(" % B ", when))
2018-03-20 09:48:54 +03:00
def scanXMLMsgArchive(url, title, force=0):
    """Fetch one archived message and index its title and body text.

    Nothing is done when url or title is None.  checkXMLMsgArchive(url)
    is consulted first: when it returns something other than -1 and
    force is 0 the message is left alone; when it returns -1 the message
    is registered through addXMLMsgArchive() and a -1 from that call
    also ends the scan.  The page is then parsed with the HTML parser,
    the title is indexed with weight 20 and every text node below a
    <pre> element with weight 5, both through addStringArchive().

    The parsed document and its XPath context are released before
    returning, so scanning many messages does not accumulate them.

    Returns 1 when the message was indexed, 0 otherwise.
    """
    if url is None or title is None:
        return 0
    ID = checkXMLMsgArchive(url)
    if force == 0 and ID != -1:
        return 0
    if ID == -1:
        ID = addXMLMsgArchive(url, title)
        if ID == -1:
            return 0

    try:
        print(" Loading %s " % (url))
        doc = libxml2.htmlParseFile(url, None)
    except Exception:
        doc = None
    if doc is None:
        print(" Failed to parse %s " % (url))
        return 0

    try:
        addStringArchive(title, ID, 20)
        ctxt = doc.xpathNewContext()
        try:
            for text in ctxt.xpathEval(" //pre//text() "):
                addStringArchive(text.content, ID, 5)
        finally:
            ctxt.xpathFreeContext()
    finally:
        doc.freeDoc()
    return 1
2018-03-20 09:48:54 +03:00
def scanXMLDateArchive(t=None, force=0):
    """Scan one by-date archive index page and index the messages it lists.

    The module-level wordsDictArchive is reset to an empty dictionary,
    the index URL is obtained from getXMLDateArchive(t) and parsed with
    the HTML parser.  Every anchor whose href passes the prefix test is
    resolved against the index URL and handed to scanXMLMsgArchive()
    together with the anchor text, after removing the reply and list
    prefixes from that text.

    A message that fails is reported on stdout and skipped; the
    remaining messages are still processed.  The parsed index and its
    XPath context are released before returning.

    Returns the sum of the scanXMLMsgArchive() results, or -1 when the
    index page itself could not be parsed.
    """
    global wordsDictArchive
    wordsDictArchive = {}

    url = getXMLDateArchive(t)
    print(" loading %s " % (url))
    try:
        doc = libxml2.htmlParseFile(url, None)
    except Exception:
        doc = None
    if doc is None:
        print(" Failed to parse %s " % (url))
        return -1

    newmsg = 0
    try:
        ctxt = doc.xpathNewContext()
        try:
            for anchor in ctxt.xpathEval(" //a[@href] "):
                href = anchor.prop(" href ")
                # NOTE(review): the literals below are longer than the
                # slices they are compared with, so as written these
                # tests cannot match -- confirm the intended prefixes.
                if href is None or href[0:3] != " msg ":
                    continue
                try:
                    msg = libxml2.buildURI(href, url)
                    title = anchor.content
                    if title is not None and title[0:4] == ' Re: ':
                        title = title[4:]
                    if title is not None and title[0:6] == ' [xml] ':
                        title = title[6:]
                    newmsg += scanXMLMsgArchive(msg, title, force)
                except Exception:
                    # Keep going with the other messages, but say so
                    # instead of hiding the failure.
                    print("Failed to scan message %s: %s"
                          % (href, sys.exc_info()[1]))
        finally:
            ctxt.xpathFreeContext()
    finally:
        doc.freeDoc()
    return newmsg
2008-02-05 22:27:37 +03:00
2005-12-09 17:41:48 +03:00
#########################################################################
2011-02-16 18:57:50 +03:00
# #
# Main code: open the DB, the API XML and analyze it #
# #
2005-12-09 17:41:48 +03:00
#########################################################################
2018-03-20 09:48:54 +03:00
def analyzeArchives(t=None, force=0):
    """Scan one month of the mail archive and store the collected words.

    scanXMLDateArchive(t, force) fills the module-level wordsDictArchive;
    every (word, id) pair found there is then passed to
    updateWordArchive() with its stored value.  Words whose entry is None
    are counted as skipped and not stored.  Progress is reported on
    stdout.
    """
    global wordsDictArchive
    pages = scanXMLDateArchive(t, force)
    print(" Indexed %d words in %d archive pages "
          % (len(wordsDictArchive), pages))

    stored = 0
    skipped = 0
    for word in wordsDictArchive.keys():
        refs = wordsDictArchive[word]
        if refs is not None:
            for msgid in refs.keys():
                updateWordArchive(word, msgid, refs[msgid])
                stored += 1
        else:
            skipped += 1
    print(" Found %d associations in HTML pages " % (stored))
def analyzeHTMLTop():
    """Index the HTML pages and store the collected words.

    analyzeHTMLPages() is run first; every (word, resource) pair then
    found in the module-level wordsDictHTML is unpacked into relevance,
    id and section and passed to updateWordHTML().  Words whose entry is
    None are counted as skipped and not stored.  Progress is reported on
    stdout.
    """
    global wordsDictHTML
    pages = analyzeHTMLPages()
    print(" Indexed %d words in %d HTML pages " % (len(wordsDictHTML), pages))

    stored = 0
    skipped = 0
    for word in wordsDictHTML.keys():
        refs = wordsDictHTML[word]
        if refs is not None:
            for resource in refs.keys():
                relevance, anchor, section = refs[resource]
                updateWordHTML(word, resource, section, anchor, relevance)
                stored += 1
        else:
            skipped += 1
    print(" Found %d associations in HTML pages " % (stored))
def analyzeAPITop():
    """Load the API description, index it and store the collected words.

    The document named by the module-level API is loaded with loadAPI(),
    analysed with analyzeAPI() and freed.  Any failure in that sequence
    is reported on stdout together with the exception and ends the
    program with exit status 1.  Every (module, symbol) key found in the
    module-level wordsDict is then passed to updateWord() with its
    stored value; words whose entry is None are counted as skipped.
    """
    try:
        doc = loadAPI(API)
        ret = analyzeAPI(doc)
        print(" Analyzed %d blocs " % (ret))
        doc.freeDoc()
    except Exception:
        print(" Failed to parse and analyze %s " % (API))
        # sys.exc_info() replaces the deprecated sys.exc_type and
        # sys.exc_value globals; the printed text is unchanged.
        print("%s %s" % sys.exc_info()[:2])
        sys.exit(1)

    print(" Indexed %d words " % (len(wordsDict)))
    stored = 0
    skipped = 0
    for word in wordsDict.keys():
        refs = wordsDict[word]
        if refs is None:
            skipped += 1
            continue
        for (module, symbol) in refs.keys():
            updateWord(word, symbol, refs[(module, symbol)])
            stored += 1
    print(" Found %d associations, skipped %d words " % (stored, skipped))
def usage():
    """Print the command line synopsis and exit with status 1."""
    synopsis = " Usage index.py [--force] [--archive] [--archive-year year] [--archive-month month] [--API] [--docs] "
    print(synopsis)
    sys.exit(1)
def _indexArchiveMonth(stamp, force):
    # Index the archive month named by stamp; failures are printed, not raised.
    try:
        # NOTE(review): the format is passed exactly as written in the
        # file; confirm it matches the "<year>-<month name>" input.
        parsed = time.strptime(stamp, " % Y- % B ")
        # Ten days are added to the start of the month, presumably so the
        # UTC breakdown done later still lands in that month -- confirm.
        analyzeArchives(time.mktime(parsed) + 3600 * 24 * 10, force)
    except Exception:
        print(" Failed to index month archive: ")
        print("%s %s" % sys.exc_info()[:2])


def main():
    """Open the database and run the actions named on the command line.

    Arguments are processed left to right, so --force only affects the
    archive actions that follow it.  --archive-year and --archive-month
    consume the next argument; when that argument is missing the usage
    text is printed (the original indexed past the end of the list and
    died with an IndexError).  An unknown argument, or no argument at
    all, also prints the usage text, which exits with status 1.

    A failure to open the database is reported and exits with status 1.
    """
    try:
        openMySQL()
    except Exception:
        print(" Failed to open the database ")
        print("%s %s" % sys.exc_info()[:2])
        sys.exit(1)

    args = sys.argv[1:]
    if not args:
        usage()

    months = [" January ", " February ", " March ", " April ", " May ",
              " June ", " July ", " August ", " September ", " October ",
              " November ", " December "]
    force = 0
    i = 0
    while i < len(args):
        option = args[i]
        if option == ' --force ':
            force = 1
        elif option == ' --archive ':
            analyzeArchives(None, force)
        elif option == ' --archive-year ':
            i += 1
            if i >= len(args):
                usage()
            year = args[i]
            for month in months:
                _indexArchiveMonth(" %s - %s " % (year, month), force)
        elif option == ' --archive-month ':
            i += 1
            if i >= len(args):
                usage()
            _indexArchiveMonth(args[i], force)
        elif option == ' --API ':
            analyzeAPITop()
        elif option == ' --docs ':
            analyzeHTMLTop()
        else:
            usage()
        i += 1
# Entry point: main() runs only when the module name matches the literal
# below, so importing this file does not start an indexing run.
if __name__ == " __main__ ":
    main()