V
V
Vladimir Grabko2016-06-13 20:37:09
go
Vladimir Grabko, 2016-06-13 20:37:09

Why does the dictionary eat up so much memory?

I had a simple task: parse email addresses from web pages. To avoid re-adding emails to the database, I made two maps:

var storageUrl map[string]int
var storageEmail map[string]int

In them, the key is a URL or an email, and the value is always 0. The URL map now has 2204 elements and the email map has 2289 elements, yet the process uses 1075 MB of memory in total. What can be done?
WARNING! My own code makes my eyes bleed.
main
spoiler
package main

import (
  "bufio"
  //"bytes"
  "fmt"
  //"log"
  "os"
  "parser/parse"
  "time"
)

// storageUrl is used as a set of URLs that have already been handed to the
// loader: only the keys matter, every value stored by main is 0.
var storageUrl map[string]int

// storageEmail is used as a set of the email addresses found so far: only the
// keys matter, every value stored by main is 0.
var storageEmail map[string]int

// main wires up the crawler: it starts 10 loader goroutines and 10 parser
// goroutines connected by unbuffered channels, one goroutine that records
// visited URLs and found emails in the package-level maps, reads the start
// URL from stdin, and then dumps the collected emails to disk every 25 seconds
// forever.
func main() {
  storageUrl = make(map[string]int, 10240)
  storageEmail = make(map[string]int)
  // All queues are unbuffered, so every send blocks until a receiver is ready.
  queueLoadUrl := make(chan string)
  queueParseUrlHtml := make(chan string)
  //queueParseEmailHtml := make(chan string)

  queueStorageUrl := make(chan string)
  queueStorageEmail := make(chan string)

  // On a panic, flush what has been collected so far and exit with status 1.
  // NOTE(review): recover only sees panics raised in main's own goroutine;
  // a panic inside any of the worker goroutines below is not caught here.
  defer func() {
    if r := recover(); r != nil {
      ReplacationFileSystem()
      os.Exit(1)
    }
  }()
  for i := 0; i < 10; i++ {
    // Loader: takes a URL, skips it if already recorded, otherwise records it
    // and forwards the downloaded page body to the parsers.
    // NOTE(review): storageUrl is read here while the storage goroutine below
    // writes to it, with no synchronization between them (data race); the
    // check-then-send is also not atomic, so two loaders can fetch the same URL.
    go func() {
      for {
        msg := <-queueLoadUrl
        _, ok := storageUrl[msg]
        if !ok {
          queueStorageUrl <- msg
          queueParseUrlHtml <- parse.LoadUrl(msg)
        }
      }
    }()

    // this goroutine is busy with parsing
    // NOTE(review): it spawns one extra goroutine per received page with no
    // upper bound; each of those holds the whole page body (msg) until all of
    // its sends on the unbuffered queues have been accepted.
    go func() {
      for {
        msg := <-queueParseUrlHtml
        go func() {
          // search for emails.
          for _, value := range parse.EmailHtml(msg) {
            queueStorageEmail <- value
          }
          // parse all links and hand them off for loading.
          // something like recursion
          for _, value := range parse.UrlHtml(msg) {
            queueLoadUrl <- value
          }
        }()
      }
    }()
  }

  // this goroutine updates the data in Storage
  // It is the only writer of storageUrl and storageEmail.
  go func() {
    for {
      select {
      case msg1 := <-queueStorageUrl:
        storageUrl[msg1] = 0
      case msg2 := <-queueStorageEmail:
        _, ok := storageEmail[msg2]
        if !ok {
          storageEmail[msg2] = 0
        }
      }
    }
  }()

  // goroutine busy with replication.
  // (The replication loop actually runs on the main goroutine, below.)

  reader := bufio.NewReader(os.Stdin)
  fmt.Print("Фраза или url: ")
  // The read error is ignored; text keeps the trailing '\n' returned by
  // ReadString, and that untrimmed string is what ends up as the storageUrl key.
  text, _ := reader.ReadString('\n')
  fmt.Println("Начал парсинг")
  queueLoadUrl <- text

  // Dump the collected emails every 25 seconds; this loop never terminates.
  for {
    time.Sleep(25000 * time.Millisecond)
    ReplacationFileSystem()
  }

}

func ReplacationFileSystem() {
  file, _ := os.Create("email.txt")
  defer file.Close()
  var buf string
  for key, _ := range storageEmail {
    buf += key + "\n"
  }
  fmt.Println("Распарсенно страниц: ", len(storageUrl), "\nВсего найдено адресов: ", len(storageEmail))
  file.WriteString(buf)
}


parse
spoiler
package parse

import (
  "io/ioutil"
  "log"
  "net/http"
  "net/url"
  "regexp"
  "strings"
  "sync"

  "golang.org/x/net/html"
)

// Host is the host of the most recent URL passed to LoadUrl that carried its
// own host. LoadUrl uses it to complete URLs that have no host (for example
// relative links).
var Host string

// hostMu guards Host inside LoadUrl. It lives at package level because a mutex
// created inside the function is private to a single call and therefore
// protects nothing when LoadUrl runs concurrently.
var hostMu sync.RWMutex

// LoadUrl downloads the page at urls and returns its body as a string.
//
// Surrounding whitespace is trimmed before parsing. A URL without a host
// inherits the package-level Host; a URL with a host updates Host. A missing
// scheme defaults to "http". Any failure (unparsable URL, request error, read
// error) yields the empty string.
func LoadUrl(urls string) string {
  u, err := url.Parse(strings.TrimSpace(urls))
  if err != nil {
    return ""
  }

  if u.Host == "" {
    hostMu.RLock()
    u.Host = Host
    hostMu.RUnlock()
  } else {
    hostMu.Lock()
    Host = u.Host
    hostMu.Unlock()
  }

  if u.Scheme == "" {
    u.Scheme = "http"
  }

  res, err := http.Get(u.String())
  if err != nil {
    return ""
  }
  // Register the close right after the error check so the body is released on
  // every return path below.
  defer res.Body.Close()

  d, err := ioutil.ReadAll(res.Body)
  if err != nil {
    return ""
  }
  return string(d)
}

// UrlHtml parses s as HTML and returns the href value of every <a> element
// that has one, in document order, keyed by a running index starting at 0.
//
// If the document cannot be parsed, the error is logged and an empty map is
// returned. The previous log.Fatal here terminated the whole process (without
// running deferred functions) because of a single bad page.
func UrlHtml(s string) map[int]string {
  urls := map[int]string{}

  doc, err := html.Parse(strings.NewReader(s))
  if err != nil {
    log.Printf("UrlHtml: %v", err)
    return urls
  }

  count := 0
  // walk visits n and all of its descendants depth-first.
  var walk func(*html.Node)
  walk = func(n *html.Node) {
    if n.Type == html.ElementNode && n.Data == "a" {
      for _, a := range n.Attr {
        if a.Key == "href" {
          urls[count] = a.Val
          count++
          // Only the first href attribute of an element is recorded.
          break
        }
      }
    }
    for c := n.FirstChild; c != nil; c = c.NextSibling {
      walk(c)
    }
  }
  walk(doc)

  return urls
}

// emailRe matches lowercase email-like tokens. It is compiled once at package
// initialisation instead of on every EmailHtml call.
var emailRe = regexp.MustCompile("([a-z0-9_\\.\\-]+)\\@(([a-z0-9\\-])+\\.)+([a-z]{2,6})")

// EmailHtml returns every non-overlapping substring of str that matches
// emailRe, in order of appearance. The result is nil when nothing matches.
func EmailHtml(str string) []string {
  return emailRe.FindAllString(str, -1)
}

Answer the question

In order to leave comments, you need to log in

3 answer(s)
N
Nikita, 2016-06-13
@VGrabko

Perhaps this is off-topic, or perhaps it is the actual cause of the problem, but this is just awful :)

//parse
rw := &sync.RWMutex{}
  if u.Host == "" {
    rw.RLock()
    u.Host = Host
    rw.RUnlock()

Is that okay? I can hardly tell what is going on here, but the inner closure looks superfluous.
go func() {
      for {
        msg := <-queueParseUrlHtml
        go func() { // <--------

V
Vladimir, 2016-06-13
@rostel

Look into memcached or Redis.
It will certainly be slower, but it will save you from having to keep a bunch of junk in memory at runtime and from losing data when the application crashes.
Redis will keep the data even if the server is restarted.

F
fastpars, 2016-06-13
@fastpars

I assume that you didn't close Body in http.Response.
When asking "Why doesn't the code work?", don't be shy: show the code in question.

Didn't find what you were looking for?

Ask your question

Ask a Question

731 491 924 answers to any question