Why does the dictionary eat up so much memory?
I had a simple task: parse emails from web pages. To avoid re-adding the same emails to the database, I made two maps:

var storageUrl map[string]int
var storageEmail map[string]int

The full code:
package main

import (
	"bufio"
	"fmt"
	"os"
	"parser/parse"
	"time"
)

var storageUrl map[string]int
var storageEmail map[string]int

func main() {
	storageUrl = make(map[string]int, 10240)
	storageEmail = make(map[string]int)
	queueLoadUrl := make(chan string)
	queueParseUrlHtml := make(chan string)
	queueStorageUrl := make(chan string)
	queueStorageEmail := make(chan string)
	defer func() {
		if r := recover(); r != nil {
			ReplacationFileSystem()
			os.Exit(1)
		}
	}()
	for i := 0; i < 10; i++ {
		go func() {
			for {
				msg := <-queueLoadUrl
				_, ok := storageUrl[msg]
				if !ok {
					queueStorageUrl <- msg
					queueParseUrlHtml <- parse.LoadUrl(msg)
				}
			}
		}()
		// this goroutine does the parsing
		go func() {
			for {
				msg := <-queueParseUrlHtml
				go func() {
					// look for emails
					for _, value := range parse.EmailHtml(msg) {
						queueStorageEmail <- value
					}
					// parse all links and hand them off for loading;
					// something like recursion
					for _, value := range parse.UrlHtml(msg) {
						queueLoadUrl <- value
					}
				}()
			}
		}()
	}
	// this goroutine updates the data in storage
	go func() {
		for {
			select {
			case msg1 := <-queueStorageUrl:
				storageUrl[msg1] = 0
			case msg2 := <-queueStorageEmail:
				_, ok := storageEmail[msg2]
				if !ok {
					storageEmail[msg2] = 0
				}
			}
		}
	}()
	// the main goroutine handles replication
	reader := bufio.NewReader(os.Stdin)
	fmt.Print("Phrase or URL: ")
	text, _ := reader.ReadString('\n')
	fmt.Println("Started parsing")
	queueLoadUrl <- text
	for {
		time.Sleep(25000 * time.Millisecond)
		ReplacationFileSystem()
	}
}

func ReplacationFileSystem() {
	file, _ := os.Create("email.txt")
	defer file.Close()
	var buf string
	for key := range storageEmail {
		buf += key + "\n"
	}
	fmt.Println("Pages parsed: ", len(storageUrl), "\nTotal addresses found: ", len(storageEmail))
	file.WriteString(buf)
}
package parse

import (
	"io/ioutil"
	"log"
	"net/http"
	"net/url"
	"regexp"
	"strings"
	"sync"

	"golang.org/x/net/html"
)

var Host string

// LoadUrl fetches the page at urls and returns its body as a string.
func LoadUrl(urls string) string {
	u, err := url.Parse(strings.TrimSpace(urls))
	if err != nil {
		return ""
	}
	rw := &sync.RWMutex{}
	if u.Host == "" {
		rw.RLock()
		u.Host = Host
		rw.RUnlock()
	} else {
		rw.Lock()
		Host = u.Host
		rw.Unlock()
	}
	if u.Scheme == "" {
		u.Scheme = "http"
	}
	res, err := http.Get(u.String())
	if err != nil {
		return ""
	}
	d, err := ioutil.ReadAll(res.Body)
	defer res.Body.Close()
	if err != nil {
		return ""
	}
	return string(d)
}

// UrlHtml collects the href attribute of every <a> tag in the document.
func UrlHtml(s string) map[int]string {
	urls := map[int]string{}
	count := 0
	doc, err := html.Parse(strings.NewReader(s))
	if err != nil {
		log.Fatal(err)
	}
	var f func(*html.Node)
	f = func(n *html.Node) {
		if n.Type == html.ElementNode && n.Data == "a" {
			for _, a := range n.Attr {
				if a.Key == "href" {
					urls[count] = a.Val
					count++
					break
				}
			}
		}
		for c := n.FirstChild; c != nil; c = c.NextSibling {
			f(c)
		}
	}
	f(doc)
	return urls
}

// EmailHtml returns all email-looking substrings of str.
func EmailHtml(str string) []string {
	r := regexp.MustCompile(`([a-z0-9_.\-]+)@(([a-z0-9\-])+\.)+([a-z]{2,6})`)
	return r.FindAllString(str, -1)
}
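To see where the memory actually goes, the live heap can be polled with runtime.ReadMemStats. A minimal standalone sketch (separate from the crawler) that just fills a map the same way storageUrl is filled:

package main

import (
	"fmt"
	"runtime"
)

func main() {
	m := make(map[string]int)
	for i := 1; i <= 1000000; i++ {
		// Keys accumulate and are never deleted, just like storageUrl;
		// the map retains every key string plus its bucket overhead.
		m[fmt.Sprintf("http://example.com/page/%d", i)] = 0
		if i%250000 == 0 {
			var s runtime.MemStats
			runtime.ReadMemStats(&s)
			fmt.Printf("keys=%d heapAlloc=%d MiB\n", len(m), s.HeapAlloc/1024/1024)
		}
	}
}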
Perhaps this is off-topic, and it may even be the cause of the problem, but this is just awful:

// parse
rw := &sync.RWMutex{}
if u.Host == "" {
	rw.RLock()
	u.Host = Host
	rw.RUnlock()

A fresh RWMutex is created on every call to LoadUrl, so each goroutine locks its own private mutex and the shared Host variable gets no real synchronization at all.
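For the locking to mean anything, the mutex has to outlive the call, e.g. a package-level guard next to Host. A rough sketch of the intended shape (resolveHost is a made-up helper name, not from the poster's code):

package parse

import "sync"

var (
	hostMu sync.RWMutex // one shared mutex, not one per call
	Host   string
)

// resolveHost reads or updates the shared Host under a mutex
// that is actually shared between goroutines.
func resolveHost(h string) string {
	if h == "" {
		hostMu.RLock()
		defer hostMu.RUnlock()
		return Host
	}
	hostMu.Lock()
	Host = h
	hostMu.Unlock()
	return h
}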
And here:

go func() {
	for {
		msg := <-queueParseUrlHtml
		go func() { // <--------

The marked line starts a brand-new goroutine for every incoming page, so under load the number of goroutines is unbounded, and each one keeps the entire HTML of its page (msg) alive until it finishes.
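A fixed pool of workers draining the channel would bound both the goroutine count and the retained HTML. A minimal sketch (startParsers and process are hypothetical names, not from the original code):

package main

import (
	"fmt"
	"sync"
)

// startParsers starts a fixed number of workers draining queue,
// instead of spawning one goroutine per message.
func startParsers(queue <-chan string, n int, process func(string)) *sync.WaitGroup {
	wg := &sync.WaitGroup{}
	for i := 0; i < n; i++ {
		wg.Add(1)
		go func() {
			defer wg.Done()
			for msg := range queue {
				process(msg) // page HTML is released once handled
			}
		}()
	}
	return wg
}

func main() {
	queue := make(chan string)
	wg := startParsers(queue, 4, func(s string) { fmt.Println("handled", s) })
	for i := 0; i < 10; i++ {
		queue <- fmt.Sprintf("page-%d", i)
	}
	close(queue)
	wg.Wait()
}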
Look into memcached or redis. It will certainly be slower, but it saves you from keeping a pile of data in the process's memory and from losing everything when the application crashes; redis will keep the data even if the server is restarted.
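A minimal sketch of that idea with the go-redis client (the import path, server address, and key format here are assumptions, not from the original code). SETNX stores a key only if it is new, which doubles as the "already seen" check:

package main

import (
	"context"
	"fmt"

	"github.com/go-redis/redis/v8"
)

func main() {
	ctx := context.Background()
	rdb := redis.NewClient(&redis.Options{Addr: "localhost:6379"})

	// SetNX returns true only the first time the key is seen,
	// so the dedup set lives outside the Go heap and survives restarts.
	fresh, err := rdb.SetNX(ctx, "email:user@example.com", 1, 0).Result()
	if err != nil {
		panic(err)
	}
	if fresh {
		fmt.Println("new email, store it")
	} else {
		fmt.Println("already seen, skip")
	}
}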